# By Shuiquan Tang, October 21, 2013

# LICENSE
# Copyright (c) Shuiquan Tang. All rights reserved.
# Copyright (c) Elizabeth Edwards' lab, Chemical Engineering and Applied Chemistry, University of Toronto. All rights reserved.

# Tested with inputs from different programs: (1) ABySS 1.3.4, (2) Allpaths-LG r44620, (3) SSPACE v. 2.0, (4) NCBI blast v. 2.2.27
# unitigs and contigs from ABySS, scaffolds from Allpaths-LG, contig link information from SSPACE, searching with blast.


# Description
# Search for overlapping unitigs that can be used to close gaps between unitigs in a scaffold
# inputs: 1) building blocks: all ABySS unitigs 2) ABySS contigs, which have resolved some gaps further in unitigs 3) a gap table specifying gaps in a scaffold
# outputs: 1) solutions 2) bridging unitigs

#!/usr/bin/perl
#use strict;
#use warnings;

# 0. define global constants
my $edge_size = 100;
# The size (bp) of the unitig edges that are extracted for searching.
# This value should be smaller than half the size of the smallest unitig within the scaffold; in the test case the unitigs in the scaffold are no less than 500 bp.
# Theoretically, this value could be just the k-mer size used in ABySS (90 bp in the test case), but a larger value is better because the read depth of a contig is more accurate when the contig is bigger.
# Read-depth information is used to filter repetitive contigs (read depth higher than the average) when building scaffolds with unitigs and using the Allpaths-LG output as the reference.

my $min_identity = 100;
# The minimum identity (%) for blast hits. In previous versions of this program this value was allowed to be lower than 100 so that imperfect overlaps could be captured.
# The current version looks for overlaps between unitigs; such overlaps should and can have 100% identity.

my $blast_word_size = 23;
# This parameter is very important: when it is too high there might be no solutions; when it is low, new contigs with shorter overlaps come up and complicate the solutions.

# Overlaps shorter than this value won't be captured. Decrease this value if shorter overlaps are to be considered.
# NOTE(review): this comment used to say "default value is 33", which disagrees with the 23 assigned above -- confirm which value is intended.

my $blast_thread_number = 3;
# Number of threads to be used in the blast search; increase this number if you have more cpu cores.
# More information about blast settings: http://dpdb.uab.cat/dpdb/help_blast.asp

my $maximum_muscle_alignment_length=3000;
# NOTE(review): not used in this part of the file; presumably the longest sequence (bp) handed to MUSCLE for alignment -- confirm.

my $gap_size_deviation_factor = 1000;
# When the extension size is larger than $estimate_gap_size + $gap_size_deviation_factor, the path is discarded.
# The standard deviation of the gap size generated by Allpaths-LG is accurate in most cases, but not in all.
# Allowing a larger deviation (such as 2000 bp) helps capture all potential solutions.
# This parameter is used to control the searching range and to determine whether a path is a solution.

# 1. define inputs and outputs
# -u ABySS unitig file, -c ABySS contig file, -g gap file, -b a controller of forcing to redo blast or not.
# e.g. -u NewUnitigs.fasta -c 1.JGI_aceto_k90n10_contigs.fa -g gaps.txt -b 1

#require "getopts.pl";
# The Perl 4 library getopts.pl is not loaded here (and is no longer shipped with core Perl),
# so the option parser comes from the core module Getopt::Std, which fills the same
# $opt_* package variables.
use Getopt::Std;
use vars qw($opt_u $opt_c $opt_g $opt_m $opt_p $opt_b $opt_i);
#$opt_u unitigs file, $opt_c ABySS contig file, $opt_g gap description file, $opt_m mate-pair links, $opt_p paired-end links, $opt_b controller to redo blast, $opt_i controller to integrate output

getopts('u:c:g:m:p:b:i:') or warn "unrecognised command-line option; continuing with the options that could be parsed\n";

# -u, -c and -g are mandatory: fail early with a clear message instead of a confusing open() error.
defined $opt_u or die "missing option -u <ABySS unitig fasta file>\n";
defined $opt_c or die "missing option -c <ABySS contig fasta file>\n";
defined $opt_g or die "missing option -g <gap table file>\n";

# Three-argument open so that unusual characters in a file name cannot be read as an open mode.
open (my $UnitigFile, '<', $opt_u) or die "fail to open $opt_u: $!\n";
open (my $ContigFile, '<', $opt_c) or die "fail to open $opt_c: $!\n";
open (my $GapFile, '<', $opt_g) or die "fail to open $opt_g: $!\n";

# -b: 1 (default) forces blast to be redone; 0 lets the program reuse existing blast files when they are present.
# -i: 1 (default) integrates the solutions into one alignment; 0 skips that step.
# An explicit 0 has to survive, so the test is "defined and valid" rather than "true":
# a plain truth test silently turns "-b 0" / "-i 0" back into 1.
foreach my $switch (\$opt_b, \$opt_i){
    $$switch = 1 unless defined $$switch and $$switch =~ /^[01]$/;
}

# create the output folder; a folder left over from a previous run is removed first.

my $files = qx(ls); # directory listing, retained in case later code reads it
if (-e 'outputs'){ # test the exact name; a regex on the listing also matched e.g. "my_outputs.txt"
    system('rm', '-r', 'outputs') == 0 or die "fail to remove the old outputs folder\n";
}
foreach my $dir ('outputs/gap_solutions', 'outputs/paths', 'outputs/steps'){
    system('mkdir', '-p', $dir) == 0 or die "fail to create $dir\n";
}

# 2. read inputs
#  read unitig file
#  in the unitig fasta file, the identifier of a unitig should be like >001 size coverage
#  %unitig_info: unitig id -> { size, coverage, seq }

my %unitig_info;
my $seq='';
my $id='';
my $TotUnitig=0;
while (my $line = <$UnitigFile>){
   $line =~ s/\r?\n\z//; # strip LF and CRLF endings alike, so no "\r" leaks into ids or sequences
   next if $line eq ''; # a blank line carries no data (and would otherwise create an empty-id entry)
   if ($line =~ /^\>/){
      # ">001 size coverage" splits into ('', '001', 'size', 'coverage')
      my @headings = split(/\ |\>/, $line);
      $id=$headings[1];
      $unitig_info{$id}{'size'}= $headings[2];
      $unitig_info{$id}{'coverage'}=$headings[3];
      $seq=''; # empty the sequence carrier
      $TotUnitig++;
   }else{
      $seq = $seq.$line; # concatenate sequences from multiple lines if existing
      $unitig_info{$id}{'seq'}=$seq; # update
   }
}
close $UnitigFile;

# read gap file, generated from scaffolding information
# tab-separated columns, as read below: gap id (e.g. 001-G-002), preceding unitig, succeeding unitig,
# gap size, gap size std, left cut, right cut, Allpaths-LG sequence. The first line is a heading and is skipped.

my %gap_info;
my @gaps;
my %unitigs_sca;
my $heading = <$GapFile>;
my %order;
while (my $line = <$GapFile>){
   $line =~ s/\r?\n\z//;
   next if $line !~ /\S/; # skip blank lines, they would register a gap with an empty id
   my @coln = split(/\t/,$line);
   # input gap information
   $gap_info{$coln[0]}{'from'}=$coln[1];
   $gap_info{$coln[0]}{'to'}=$coln[2];
   $gap_info{$coln[0]}{'size'}=$coln[3];
   $gap_info{$coln[0]}{'std'}=$coln[4];
   $gap_info{$coln[0]}{'left_cut'}=$coln[5];
   $gap_info{$coln[0]}{'right_cut'}=$coln[6];
   $gap_info{$coln[0]}{'allpaths'}=$coln[7];
   push (@gaps, $coln[0]);# save a list of gaps, in file order
   $unitigs_sca{$coln[1]}++;# save a hash of unitigs to be scaffold
   $unitigs_sca{$coln[2]}++;# save a hash of unitigs to be scaffold
   $order{$coln[1]}{'to'}=$coln[2];
   $order{$coln[2]}{'from'}=$coln[1];
}
close $GapFile;

# read ABySS contigs file.
# %contigs: contig id -> sequence ($id and $seq are the same carriers used for the unitig file)
my %contigs;
while (my $line = <$ContigFile>){
   $line =~ s/\r?\n\z//;
   next if $line eq '';
   if ($line =~ /^\>/){
      my @headings = split(/\ |\>/, $line);
      $id=$headings[1];
      $seq=''; # empty the sequence carrier
   }else{
      $seq = $seq.$line; # concatenate sequences from multiple lines if existing
      $contigs{$id}=$seq; # update
   }
}
close $ContigFile;

#3. self-blast: blast all unitigs (including edges of all unitigs within the scaffold) against all unitigs to obtain overlapping relationships.
my %edges; # edge sequences of the scaffold unitigs, key e.g. 001W (first $edge_size bases) or 001E (last $edge_size bases)

foreach my $tig (keys%unitigs_sca){
   # each edge is also registered in %unitig_info as a pseudo-unitig that inherits the coverage of its unitig
   $edges{$tig.'W'}=substr($unitig_info{$tig}{'seq'},0,$edge_size);
   $edges{$tig.'E'}=substr($unitig_info{$tig}{'seq'},$unitig_info{$tig}{'size'}-$edge_size,$edge_size);
   foreach my $side ('W', 'E'){
      $unitig_info{$tig.$side}{'seq'} = $edges{$tig.$side};
      $unitig_info{$tig.$side}{'size'} = $edge_size;
      $unitig_info{$tig.$side}{'coverage'} = $unitig_info{$tig}{'coverage'};
   }
}
# create a new hash of "steps" with the key as stepID (unitig or edge id + F/R) and values of size, coverage and sequence;
# the R step holds the sequence returned by RC_DNA.
my %steps;
foreach my $tig (keys%unitig_info){
    $steps{$tig.'F'}{'size'}=$unitig_info{$tig}{'size'};
    $steps{$tig.'F'}{'coverage'}=$unitig_info{$tig}{'coverage'};
    $steps{$tig.'F'}{'seq'}=$unitig_info{$tig}{'seq'};
    $steps{$tig.'R'}{'size'}=$unitig_info{$tig}{'size'};
    $steps{$tig.'R'}{'coverage'}=$unitig_info{$tig}{'coverage'};
    $steps{$tig.'R'}{'seq'}=RC_DNA($unitig_info{$tig}{'seq'});
}

# create a new fasta file holding the edge sequences followed by every unitig of %unitig_info.
my $LocalFiles = qx(ls);
my $UniEdgeFile = "unitigs_and_edges.fasta";
if (!(-e $UniEdgeFile) or ($opt_b eq 1)){
# The blast process could be very time-consuming;
# this test checks whether the fasta file of an earlier run exists, in which case the existing blast results are reused;
# you can also force the program to run blast and overwrite previous results by using "-b 1".
# (-e tests the exact file name; matching the name against the ls listing also matched the blast index files.)
   open (my $f1, '>', $UniEdgeFile) or die "fail to open $UniEdgeFile: $!\n";
   # print, not printf: the record is data and must never be interpreted as a format string
   foreach my $tig (keys%edges){
      print($f1 ">$tig\n$edges{$tig}\n");
   }
   foreach my $tig (keys%unitig_info){
        if ($tig =~ /[WE]$/){ # edges were registered in %unitig_info too; do not write them twice
            next;
        }
        my $seq = $unitig_info{$tig}{'seq'};
        print($f1 ">$tig\n$seq\n");
   }
   close $f1 or die "fail to write $UniEdgeFile: $!\n";
   # The commands run in order and in list form, so file names are passed verbatim and never re-parsed by a shell.
   # uxu.txt: unitigs+edges versus unitigs+edges (megablast, filtered by -perc_identity, tabular -outfmt 6).
   # uxc.txt: the unitig file given with -u versus the ABySS contigs.
   foreach my $cmd (
      ['makeblastdb', '-in', $UniEdgeFile, '-dbtype', 'nucl'], # format blast database
      ['blastn', '-task', 'megablast', '-query', $UniEdgeFile, '-db', $UniEdgeFile, '-out', 'uxu.txt', '-outfmt', '6',
       '-num_threads', $blast_thread_number, '-perc_identity', $min_identity, '-word_size', $blast_word_size],
      ['makeblastdb', '-in', $opt_c, '-dbtype', 'nucl'],
      ['blastn', '-task', 'megablast', '-query', $opt_u, '-db', $opt_c, '-out', 'uxc.txt', '-outfmt', '6',
       '-num_threads', $blast_thread_number, '-max_target_seqs', '2'],
   ){
      system(@$cmd) == 0 or warn "command failed (status $?): @$cmd\n";
   }
}
$LocalFiles = qx(ls);
if (!(-e 'uxu.txt')){ # a considerate measure: without the self-blast table nothing below can work
   die("restart by adding -b 1"); # the option takes its value as a separate word, "-b=1" is not read as 1
}

#3c. read blast outputs.
# In this step the program reads the blast result and saves the overlap information into the hash %overlap.
my %overlap; # two-level hash $overlap{tig1}{tig2} holding an array reference; per the original note the array is (query-start, query-end, target-start, target-end, the target) -- filled by parseblast, defined elsewhere

my %recorder; # record the unique correlation between query and hit (not used in this part of the file)
parseblast("uxu.txt", \%overlap); # the parsing itself is done in this subroutine.
TrimRedundancy (\%overlap); # drop overlaps that are implied by two consecutive overlaps
# TrimRedundancy(\%overlaps)
# Removes "redundant" overlaps from a two-level overlap table $t->{tig1}{tig2} = [..coordinates..].
# An entry tig1->tig3 is redundant when it is implied by the chain tig1->tig2->tig3, i.e. when its
# coordinates equal the ones computed from the tig1->tig2 and tig2->tig3 entries (see the test below).
# Elements [0], [1] and [3] are presumably query-start, query-end and target-end -- confirm against parseblast.
#   argument: reference to the overlap table; it is modified in place. When called without an
#             argument the file-level %overlap is used (the behaviour of the earlier version,
#             which ignored its argument).
#   returns : nothing useful.
# Note: "keys %{...}" on a tig that has no entry of its own creates an empty entry for it.
sub TrimRedundancy {
    my $table = @_ ? shift : \%overlap;
    foreach my $tig1 (keys %$table){
        my @redundant=();
        foreach my $tig2 (keys %{$table->{$tig1}}){
            my $first_hop = $table->{$tig1}{$tig2};
            foreach my $tig3 (keys %{$table->{$tig2}}){
                my $second_hop = $table->{$tig2}{$tig3};
                my $direct = $table->{$tig1}{$tig3};
                next if !$direct;
                if ($$direct[3]==($$first_hop[3]-$$second_hop[0]+1)
                    and $$direct[0]==($$first_hop[1]-$$first_hop[3]+$$second_hop[0])) {
                    push (@redundant, $tig3);
                }
            }
        }
        # delete only after all partners of $tig1 were inspected, so the scan above sees a stable table
        foreach my $tig3 (@redundant){
            delete($table->{$tig1}{$tig3});
        }
    }
}

my %uxc; # filled by parseblast (defined elsewhere) from uxc.txt
parseblast("uxc.txt", \%uxc); # parse the results of the unitigs blasted against the contigs.

#3d read paired-end and mate-pair links
# CollectLinks leaves the hash empty when the corresponding option (-m / -p) was not given.

my %MPlinks; # mate-pair links: $MPlinks{from}{to}{'link'|'dist'}
CollectLinks($opt_m, \%MPlinks);
my %PElinks; # paired-end links, same layout
CollectLinks($opt_p, \%PElinks);
# CollectLinks($LinkFile, \%links)
# Reads a tab-separated link table and stores, for every data line,
#   $links->{from}{to}{'link'} = column 3 (number of links)
#   $links->{from}{to}{'dist'} = column 10
# where "from" and "to" are columns 1 and 2. Lines starting with '#' and blank lines are skipped.
#   $LinkFile : path of the table; when false (option not given) nothing is read
#   $links    : hash reference that receives the links
# Dies when the file cannot be opened.
sub CollectLinks{
    my ($LinkFile, $links) = @_;
    return if !$LinkFile;
    # three-argument open: the file name can never be mistaken for an open mode
    open (my $fh, '<', $LinkFile) or die "fail to open $LinkFile: $!\n";
    while (my $record = <$fh>) {
        chomp $record;
        next if $record =~ /^#/;
        next if $record !~ /\S/; # a blank line would otherwise store a link between two empty ids
        my @fields = split(/\t/, $record);
        my ($from, $to, $LinkNum) = @fields[0 .. 2];
        my $dist = $fields[9];
        $links->{$from}->{$to}->{'link'}=$LinkNum;
        $links->{$from}->{$to}->{'dist'}=$dist;
    }
    close $fh;
    return;
}





#4. Search and print results
# here is the actual searching process.

# Before the iteration over the gaps, declare the file-level variables that the loop and the subroutines share.
my $gap; # the id of the gap being processed, such as 001-G-002
my $start; # e.g. 001EF
my $startx; # copy of $start taken before the search; read by CheckRepeat
my $end; # e.g. 002WF
my $endx; # copy of $end taken before the search; read by CheckRepeat
my $gap_size_deviation; # NOTE(review): the gap loop declares its own "my $gap_size_deviation", so this one is not the variable the loop assigns
my @solutions; # an array of references of different selected paths
my %bridging_unitigs; # a hash of unitigs derived from the solutions to a gap.
my %ties; # emptied by MarkRepeats; presumably filled by StoreTies (defined elsewhere)
my %loops; # 'a'-> the array of different loops; a loop is like a->b->c->a or a->d->e->a, or a->a
my %loop_paths;
my %solution_size;
my %Dist;
my $improved; # reset to 0 at the start of every condensing round of the gap loop
my $fst_paths;
my $repeated; # result of HasLoop / MarkRepeats for the current gap
my %outputs; # $outputs{category}{gap}{...}, filled by Save and consumed by Integration
my @cats= ('abyss+allpaths','abyss_only','allpaths_only','single','multiple','repeated','no_solution'); # solution categories, one output folder each
my %unitigs_in_solutions; # unitig id -> number of times it was seen in a solution (counted in fst_output)
foreach my $cat (@cats){
    system("mkdir outputs/gap_solutions/$cat");
}

# Iterate over all gaps, in the order of the gap file.
# For each gap: (0) accept a solution from the ABySS contigs or from Allpaths-LG when available,
# (1) search for paths between the two edges, (2) condense ties, (3) search again in the reduced
# space and write the final result. Search, Initialization, CountDistance, the *CondenseTies subs,
# UpdateSpace, CondensePaths, MatePairRefine, FinalCondense and output_B are defined elsewhere in the file.
foreach my $i (@gaps){
    my $core=(); my $links=();
    $gap = $i; # gap id, shared with the subroutines through the file-level $gap
        #if ($gap ne '060-G-061'){next} # focus on one gap for debugging
    print("---------------$gap--------------\n"); # progress marker on standard output.
    # disabled debugging filters: skip, or focus on, individual gaps
    #if ($gap eq '0055-G-0056'){next} # focus on one gap for debugging
    #if ($gap ne '1062-G-1063'){next}
    #if ($gap eq '0286-G-0287'){next}
    #if ($gap eq '0347-G-0348'){next}
    #if ($gap eq '0123-G-0124'){next}
    #if ($gap eq '0185-G-0186'){next}
    #if ($gap eq '0124-G-0125'){next}
    #if ($gap eq '0140-G-0141'){next}
    #if ($gap eq '0439-G-0440'){next}
    #if ($gap eq '0093-G-0094'){next}
    #if ($gap ne '0017-G-0018'){next}
    if (check_ABySS_contigs()){ # presumably also saves the solution -- confirm in check_ABySS_contigs
        print("Resolved with ABySS contigs\n");
        next;
    }
    if ($gap_info{$gap}{'std'}==0) { # a std of 0 in the gap table marks a gap taken as-is from the Allpaths-LG sequence
        print("Resolved with Allpaths-LG contigs\n");
        AllpathsOutput($gap);
        next;
    }
    $start = $gap_info{$gap}{'from'}."E"."F"; # the edge sequence of the preceding unitig, e.g. 001EF, which represents the 3' edge ('E') of unitig 001 in Forward (F) direction
    $end = $gap_info{$gap}{'to'}."W"."F"; # the edge sequence of the succeeding unitig, e.g. 002WF, which represents the 5' edge ('W') of unitig 002 in Forward (F) direction
    $startx = $start; # $start and $end are presumably updated while condensing ties (step 2); these copies are the ones CheckRepeat reads
    $endx = $end;
    Initialization();
    my $gap_size_deviation = $gap_size_deviation_factor; # lexical to this loop body
    my $space = \%overlap; # search space of the first pass: all trimmed overlaps
    Search($fst_paths, $space, $gap_size_deviation,1); # search for solutions to a gap and potential loops identified in the search.
    if (!@solutions) {
        Initialization();
        $gap_size_deviation = $gap_size_deviation_factor+500;# relax the gap size requirement
        Search($fst_paths, $space, $gap_size_deviation,1); #search again
    }
    fst_output(\@solutions); # write the first-pass paths and the sequences of their steps
    CountDistance (\@solutions);
    my $num = scalar(@solutions);
    $repeated=HasLoop(\@solutions);
    if ($repeated) {
        $repeated=MarkRepeats(\@solutions, $gap_size_deviation);
    }
    print ("$num\n"); # number of first-pass solutions
    if (!@solutions) {
        print("No solutions found\n");
        output_NA($gap);
        next;
    }

    
    # step 2: condense the ties and update $start and $end
    # the two condensing passes are repeated until neither of them sets $improved
    print("step2\t");
    my %searched_steps;
    do {$improved =0;
        HoriCondenseTies($start,\%searched_steps);
        %searched_steps = ();
        VertiCondenseTies($start,\%searched_steps);
        %searched_steps = ();
    }while($improved);
    

    # step 3: update search space, change search mode to exhaustive and redo search
    print("step3\n");
    $space = UpdateSpace(\%ties);
    Initialization();
    if ($start ne $end) {
        Search($fst_paths, $space, $gap_size_deviation+500,1);
        ($core, $links)=CondensePaths(\@solutions);
        # use mate-pair links (only when -m was given) to help resolve non-unique links between consensus steps
        if ($opt_m) {
            $links = MatePairRefine($core, $links, \%MPlinks);
        }
        $core = FinalCondense($core, $links);
    }else{
        # start and end are the same step: the core is that single step
        $core = [$start];
        $links =();
    }
    # output the final result of this gap, then forget the loops found for it
    output_B ($core, $links);
    %loops=();
    %loop_paths=();
}

# output subroutines
# fst_output(\@paths)
# Writes the first-pass search result of the current gap (file-level $gap):
#   outputs/paths/<gap>.txt   - one line per path (scaffolding size, its std, the size returned by
#                               EstimateDist, the steps joined with '-'); steps that have loops are
#                               put in parentheses and their loops are listed after the paths.
#   outputs/steps/<gap>.fasta - the sequences of all steps used (">step-size-coverage"), plus the
#                               Allpaths-LG sequence of the gap unless that is '-'.
# Without any path the txt file says "no results!" and the fasta file holds the two edge
# sequences and the Allpaths-LG sequence.
# Each path is an array of steps; the step id (e.g. 012F or 001EF) is element [0] of a step.
# Side effects: counts every non-edge unitig seen in %unitigs_in_solutions; paths are echoed to STDOUT.
# Dies when an output file cannot be opened (an unchecked open would lose the output silently).
sub fst_output{
   my $bridges = shift;
   my $file_paths = "outputs/paths/$gap.txt";
   my $file_unitigs = "outputs/steps/$gap.fasta";
   open (my $f1, '>', $file_paths) or die "fail to open $file_paths: $!\n";
   print($f1 "scaffolding_size\tscaffolding_size_deviation\tsolution_size\tsolutions\n");

   if (!@$bridges){
      print($f1 "no results!\n");
      close $f1;
      open (my $f2, '>', $file_unitigs) or die "fail to open $file_unitigs: $!\n";
      my @edge = split(/\-/, $gap); # 001-G-002 -> (001, G, 002)
      my $left_edge = $edge[0].'E';
      my $right_edge = $edge[2].'W';
      print ($f2 ">${left_edge}F-$edge_size-$unitig_info{$edge[0]}{'coverage'}\n$edges{$left_edge}\n");
      print ($f2 ">${right_edge}F-$edge_size-$unitig_info{$edge[2]}{'coverage'}\n$edges{$right_edge}\n");
      print ($f2 ">$gap-allpaths\n$gap_info{$gap}{'allpaths'}\n");
      close $f2;
      return;
   }

   open (my $f2, '>>', $file_unitigs) or die "fail to open $file_unitigs: $!\n";
   my %loops_in_solutions; # steps of the solutions that have loops
   my %allunitigs; # every step whose sequence has to be written
   foreach my $path (@$bridges){
      my $scaffolding_size = $gap_info{$gap}{'size'};
      my $scaffolding_std = $gap_info{$gap}{'std'};
      my $size = EstimateDist($path,$start,$end);
      my @names;
      foreach my $step (@$path){
         my $unitig = $step -> [0];
         $allunitigs{$unitig}++;
         if ($unitig !~ /[WE]/) {
            # whole unitig: count it under its id, i.e. the step id without the trailing F/R
            $unitigs_in_solutions{substr($unitig, 0, length($unitig)-1)}++;
         }
         if ($loops{$unitig}){
            $loops_in_solutions{$unitig}++;
            $unitig = "($unitig)";
         }
         push (@names, $unitig);
      }
      my $path2 = join ('-', @names);
      print($f1 "$scaffolding_size\t$scaffolding_std\t$size\t$path2\n");
      print("$path2\n");
   }
   # sorted, so that the files do not depend on the hash order of the run
   foreach my $repeat (sort keys(%loops_in_solutions)){
      print($f1 "Loops of $repeat\n");
      foreach my $loop (@{$loops{$repeat}}){
         my @names;
         foreach my $step (@$loop){
            my $unitig = $step->[0];
            push (@names, $unitig);
            $allunitigs{$unitig}++;
            if ($unitig !~ /[WE]/) {
               $unitigs_in_solutions{substr($unitig, 0, length($unitig)-1)}++;
            }
         }
         my $path = join ('-', @names);
         print($f1 "$path\n");
      }
   }
   close $f1;

   foreach my $unitig (sort keys%allunitigs){
      my $id = substr($unitig, 0, length($unitig)-1); # step id without its direction
      my $FR = substr($unitig,length($unitig)-1, 1); # direction, F or R
      my $seq;
      my $size;
      my $coverage;
      if ($id =~ /[WE]/){
         # edge step, e.g. 001EF: the sequence comes from %edges, the coverage from the unitig (001)
         if ($FR eq "F"){
            $seq = $edges{$id};
         }elsif ($FR eq "R"){
            $seq = RC_DNA($edges{$id});
         }
         $id = substr($unitig, 0, length($id)-1); # drop the W/E as well
         $size = $edge_size;
         $coverage = $unitig_info{$id}{'coverage'};
      }else{
         if ($FR eq "F"){
            $seq = $unitig_info{$id}{'seq'};
         }elsif ($FR eq "R"){
            $seq = RC_DNA($unitig_info{$id}{'seq'});
         }
         $size = length ($seq);
         $coverage = $unitig_info{$id}{'coverage'};
      }
      print ($f2 ">$unitig-$size-$coverage\n$seq\n");
   }
   if ($gap_info{$gap}{'allpaths'} ne '-') {
      print ($f2 ">$gap-allpaths\n$gap_info{$gap}{'allpaths'}\n");
   }
   close $f2;
}
# final outputs, written once all gaps were processed

Integration(\%outputs); # statistics, solution tables and, unless -i 0, the final alignment

Output_unitigs(\%unitigs_in_solutions); # fasta file of the scaffold unitigs and the unitigs used in solutions

# Output_unitigs(\%unitigs)
# Writes outputs/related.unitigs.fasta: first all unitigs of the scaffold (%unitigs_sca), then the
# unitigs given in %unitigs that are neither scaffold unitigs nor edge entries (ids ending in E/W).
# Both groups are sorted numerically by id; the header line is ">id size coverage".
#   \%unitigs : hash reference whose keys are unitig ids
# Dies when the file cannot be opened or written.
sub Output_unitigs{
    my $unitigs = shift;
    my $out_file = "outputs/related.unitigs.fasta";
    open(my $f1, '>', $out_file) or die "fail to open $out_file: $!\n";
    my @scaffold_tigs = sort {$a<=>$b} keys %unitigs_sca;
    my @other_tigs = grep { !($unitigs_sca{$_} or ($_ =~ /[EW]$/)) } sort {$a<=>$b} keys %$unitigs;
    foreach my $unitig (@scaffold_tigs, @other_tigs){
        my $size = $unitig_info{$unitig}{'size'};
        my $kmer_depth = $unitig_info{$unitig}{'coverage'};
        my $seq = $unitig_info{$unitig}{'seq'};
        print($f1 ">$unitig $size $kmer_depth\n$seq\n");
    }
    # the handle used to be left open; closing it also reports write errors that surface at close
    close $f1 or die "fail to write $out_file: $!\n";
}


# Integration(\%outputs)
# Summarises the solutions of all gaps and writes the final files below outputs/:
#   category_info                         - number and list of gaps per solution category
#   gap_info.txt                          - one line per gap of @gaps (sizes, cuts, category, consensus)
#   Modidfied_unitigs_and_solutions_A.txt - solutions of the remaining categories plus the trimmed unitigs
#   Modidfied_unitigs_and_solutions_B.txt - solutions of the categories multiple / no_solution / repeated
#   FinalAlignment.afa                    - gap-padded alignment of unitigs and solutions (skipped when -i 0)
# Argument: reference to the hash $outputs{category}{gap}{...} as filled by Save.
# Dies when an output file cannot be opened.
sub Integration{# 1. trim contigs and concatenate the genome, generate an alignment file, a genbank file of the consensus, a summary txt file, an excel statistic file.
    my %outputs = %{$_[0]}; # shallow copy; shadows the file-level %outputs
    #step 1: generate statistic files: gap_info.txt category_info
    my @cats = ('abyss+allpaths','abyss_only', 'allpaths_only', 'single','multiple','repeated','no_solution');
    open(my $f1, ">outputs/category_info") or die;
    print($f1 "gap_category\tgap_no\tgap_list\n");
    my %gap_sol; # gap id -> copy of its solution record plus the key 'cat'
    foreach my $cat (@cats){
        my %gaps;
        my $gapnum;
        my $gaplist;
        if ($outputs{$cat}) {
            %gaps = %{$outputs{$cat}};
            $gapnum = scalar(keys%gaps);
            $gaplist=join(' ', keys%gaps);
            foreach my $gap (keys%gaps){
                # a gap saved under several categories keeps the record of the last category in @cats
                %{$gap_sol{$gap}}=%{$gaps{$gap}};
                $gap_sol{$gap}{'cat'}=$cat;
            }
        }else{
            $gapnum=0;
            $gaplist='';
        }
        print($f1 "$cat\t$gapnum\t$gaplist\n");
    }
    close $f1;
    open(my $f2, ">outputs/gap_info.txt") or die;
    open(my $f4,">outputs/Modidfied_unitigs_and_solutions_A.txt") or die;
    open(my $f5,">outputs/Modidfied_unitigs_and_solutions_B.txt") or die;
    my @headline = qw(id size_est std_est size left_cut right_cut category seq/concensus);
    my $headline = join ("\t", @headline);
    print($f2 "$headline\n");
    my %trim_unitigs; # unitig id -> { '5' => bases to trim at the 5' end, '3' => bases to trim at the 3' end }
    foreach my $gap (@gaps){
        # NOTE(review): a gap without a saved solution gets an empty %gap_sol entry here and prints empty fields -- confirm every gap is saved
        my $id = $gap;
        my $size_est = $gap_sol{$gap}{'est_size'};
        my $size = $gap_sol{$gap}{'size'};
        my $std_est = $gap_info{$gap}{'std'};
        my $left_cut = $gap_sol{$gap}{'left_cut'};
        my $right_cut = $gap_sol{$gap}{'right_cut'};
        my $cat = $gap_sol{$gap}{'cat'};
        my $concensus = $gap_sol{$gap}{'con'};
        print($f2 "$id\t$size_est\t$std_est\t$size\t$left_cut\t$right_cut\t$cat\t$concensus\n");
        
        # the left cut applies to the 3' end of the preceding unitig, the right cut to the 5' end of the succeeding one
        my $pro_tig = $gap_info{$id}{'from'};
        my $suc_tig = $gap_info{$id}{'to'};
        $trim_unitigs{$pro_tig}{'3'}=$left_cut;
        $trim_unitigs{$suc_tig}{'5'}=$right_cut;
        if (($cat eq 'multiple') or ($cat eq 'no_solution') or ($cat eq 'repeated')) {
            print ($f5 ">$id,$cat,$left_cut,$right_cut\n$concensus\n");
        }else{
            print ($f4 ">$id,$cat,$left_cut,$right_cut\n$concensus\n");
        }
    }
    
    # write every unitig next to a gap with its cuts removed; a missing or zero cut trims nothing
    foreach my $tig (keys%trim_unitigs){
        my $trim5 = 0;
        my $trim3 = 0;
        if ($trim_unitigs{$tig}{'5'}) {
            $trim5 = $trim_unitigs{$tig}{'5'};
        }
        if ($trim_unitigs{$tig}{'3'}) {
            $trim3 = $trim_unitigs{$tig}{'3'};
        }
        my $seq = $unitig_info{$tig}{'seq'};
        $seq = substr($seq, $trim5, length($seq)-$trim5-$trim3);
        print ($f4 ">$tig,-$trim5,-$trim3\n$seq\n");
    }
    
    close $f2;
    close $f4;
    close $f5;
    if ($opt_i==0) { # -i 0: no integrated alignment wanted
        return;
    }
    #step 2: output alignment file
    # The scaffold unitigs are taken in numeric id order; each unitig and each gap solution becomes one
    # row, padded with '-' on the left up to its offset, while the rows written before are extended on the right.
    # NOTE(review): the offsets below presumably assume that a solution starts $left_cut bases before the
    # end of the preceding unitig and ends $right_cut bases into the next one -- confirm.
    my @tigs_sca = sort {$a<=>$b}keys%unitigs_sca;
    my %tigs_trim;
    my @seq_order; # row ids in output order
    my %aln_seqs; # row id -> padded sequence
    my $left_cut=0;
    my $right_cut=0;
    my $size=0; # length of the most recent solution row, including its padding
    for (my $i=0; $i<scalar@tigs_sca;$i++){
        my $tig1=$tigs_sca[$i];
        my $tig1_seq=$unitig_info{$tig1}{'seq'};
        my $tig1_size=$unitig_info{$tig1}{'size'};
        # extend all earlier rows by the part of this unitig that lies beyond the previous solution
        foreach my $tig (@seq_order){
            $aln_seqs{$tig}.='-'x($tig1_size-$right_cut);
        }
        $tig1_seq='-'x($size-$right_cut).$tig1_seq;
        $tig1_size = length($tig1_seq);
        push (@seq_order, $tig1);
        $aln_seqs{$tig1}=$tig1_seq;
        my $tig2;
        if ($i==(scalar@tigs_sca-1)) {
            $tig2=$tigs_sca[0]; # after the last unitig, look for a gap back to the first one
        }else{
            $tig2=$tigs_sca[$i+1];
        }
        my $gap = $tig1.'-G-'.$tig2;
        my @sol_order;
        my %sol;
        if ($gap_sol{$gap}) {
            @sol_order = @{$gap_sol{$gap}{'order'}};
            $left_cut = $gap_sol{$gap}{'left_cut'};
            $right_cut = $gap_sol{$gap}{'right_cut'};
            %sol=%{$gap_sol{$gap}{'seqs'}};
            my $sol_size = length($sol{$sol_order[0]}); # the length of the first solution sets the width for all rows
            foreach my $tig (@seq_order){
                $aln_seqs{$tig}.='-'x($sol_size-$left_cut);
            }
            for (my $i=0; $i<scalar(@sol_order);$i++){ # this $i is local to the inner loop and hides the unitig index
                my $step =  $sol_order[$i];
                my $seq = $sol{$step};
                my $index='';
                if (scalar@sol_order>1) {
                    $index='_'.($i+1); # several solutions: rows are numbered <gap>_1, <gap>_2, ...
                }
                my $newid = $gap.$index;
                push (@seq_order, $newid);
                $seq='-'x($tig1_size-$left_cut).$seq;
                $size = length($seq);
                $aln_seqs{$newid}=$seq;
            }
            
        }
        
    }
    open(my $f3, ">outputs/FinalAlignment.afa") or die;
    foreach my $i (@seq_order){
        my @id = split(/\_/, $i); # <gap>_<n> -> gap id in $id[0]
        if ($id[0]=~/G/) {
            # solution rows carry their category in the header
            my $cat = $gap_sol{$id[0]}{'cat'};
            print($f3 "\>$i,$cat\n$aln_seqs{$i}\n");
        }else{
            print($f3 "\>$i\n$aln_seqs{$i}\n");
        }
        
    }
    close $f3;
    #step 3: trim contigs within scaffolds, concatenate, and output the genbank file (not implemented here)
    
}

# dealing with repeats in solutions
# HasLoop(\@solutions)
# Tells whether any step of any solution path has loops recorded in %loops.
#   \@solutions : reference to an array of paths; a path is an array of steps whose id is element [0]
#   returns     : 1 as soon as such a step is found, 0 otherwise
# The paths are only read, so they are walked through the reference without copying the array;
# the size/std locals the earlier version fetched but never used are gone.
sub HasLoop{
    my $paths = $_[0];
    foreach my $path (@$paths){
        foreach my $step (@$path){
            return 1 if $loops{$step -> [0]};
        }
    }
    return 0;
}

# MarkRepeats(\@solutions, $deviation)
# Pass 1: for every step of every path that has loops in %loops, collect the loops that
#         CheckRepeat accepts for that path (size check with $deviation).
# Pass 2: only when at least one loop was accepted - every occurrence of such a step gets an
#         id of its own, "<step>_<path number><position>" (both 1-based), registered in %steps
#         with the sequence and size of the original step.
# In both cases %ties is emptied first and StoreTies is called for every path at the end.
# Returns 1 when steps were renamed, 0 otherwise.
# NOTE(review): path number and position are concatenated without a separator, so e.g. path 1 /
# position 12 and path 11 / position 2 yield the same suffix -- confirm that this is harmless.
sub MarkRepeats{
    my ($paths, $deviation) = @_;
    %ties=();
    my %repeats; # step id -> accepted loops

    foreach my $path (@$paths){
        for my $pos (0 .. $#$path){
            my $step = $path->[$pos]->[0];
            next unless $loops{$step};
            foreach my $repeat (@{$loops{$step}}){
                push (@{$repeats{$step}},$repeat) if CheckRepeat($path,$repeat,$step,$deviation);
            }
        }
    }

    my $marked = %repeats ? 1 : 0;
    if ($marked){
        for my $path_no (0 .. $#$paths){
            my $path = $paths->[$path_no];
            for my $pos (0 .. $#$path){
                my $step = $path->[$pos]->[0];
                next unless $repeats{$step};
                my $newstep = $step.'_'.($path_no+1).($pos+1);
                $steps{$newstep}{'seq'}=$steps{$step}{'seq'};
                $steps{$newstep}{'size'}=$steps{$step}{'size'};
                $path->[$pos]->[0]=$newstep;
            }
        }
    }

    StoreTies($_) foreach @$paths;
    return $marked;
}

# CheckRepeat(\@path, \@loop, $step, $deviation)
# Decides whether the loop still fits into the current gap (file-level $gap) when it is added
# to the path: path size + loop size + size of the step must not exceed the expected gap size
# plus the allowed deviation. A deviation of 0 is replaced by 20.
# The sizes come from EstimateDist, which is handed copies of both arrays.
# Returns 1 when the loop fits, 0 otherwise.
sub CheckRepeat{
    my ($path, $loop, $step, $allowed) = @_;
    my @path_copy = @$path;
    my @loop_copy = @$loop;
    my $step_size = $steps{$step}{'size'};
    my $expected = $gap_info{$gap}{'size'};
    my $path_size = EstimateDist(\@path_copy,$startx,$endx);
    my $loop_size = EstimateDist(\@loop_copy,$step,$step);
    $allowed = 20 if $allowed ==0;
    return ($path_size+$loop_size+$step_size <= ($expected+$allowed)) ? 1 : 0;
}

#subroutine for output
# Save a placeholder for a gap that has no solution: the two flanking unitig
# edges joined by a run of N's as long as the estimated gap size (a single
# 'n' when that size is zero or negative).
#   $gap  gap ID (key of %gap_info)
sub output_NA{
    my $gap = shift;
    my $gap_size = $gap_info{$gap}{'size'};
    my $filler = ($gap_size <= 0) ? 'n' : 'N' x $gap_size;

    # "<id>EF": the 3' edge ('E') of the preceding unitig in forward (F) direction, e.g. 001EF
    my $left_edge  = $gap_info{$gap}{'from'}.'E'.'F';
    # "<id>WF": the 5' edge ('W') of the succeeding unitig in forward (F) direction, e.g. 002WF
    my $right_edge = $gap_info{$gap}{'to'}.'W'.'F';

    my $seq = $steps{$left_edge}{'seq'}.$filler.$steps{$right_edge}{'seq'};
    Save($gap, 'no_solution', '', $edge_size, $edge_size, $seq, {$gap=>$seq}, [$gap]);
}

# Save the gap sequence recorded under the 'allpaths' key of %gap_info, in
# category 'allpaths_only'. A missing (or false) sequence is stored as ''.
#   $gap  gap ID (key of %gap_info)
sub AllpathsOutput{
    my $gap = shift;
    my $seq = $gap_info{$gap}{'allpaths'};
    my ($left_cut, $right_cut) = ($gap_info{$gap}{'left_cut'}, $gap_info{$gap}{'right_cut'});
    $seq = '' if !$seq;
    Save($gap, 'allpaths_only', $gap_info{$gap}{'size'}, $left_cut, $right_cut, $seq, {$gap=>$seq}, [$gap]);
}

# Write one gap solution to disk as FASTA and record it in the global %outputs.
#   $gap        gap ID
#   $cat        output category; also the sub-directory and the file extension
#   $size       gap size of this solution
#   $left_cut   value stored as 'left_cut' and used in the file name
#   $right_cut  value stored as 'right_cut' and used in the file name
#   $con        consensus sequence
#   $seqs       hash ref: sequence ID => sequence
#   $seq_order  array ref giving the order in which the IDs are written
# The file is outputs/gap_solutions/<cat>/<gap>,-<left_cut>,-<right_cut>.<cat>;
# the directory must already exist. Dies with the path and the system error
# when the file cannot be opened or closed.
sub Save{
    my ($gap, $cat, $size, $left_cut, $right_cut, $con, $seqs, $seq_order) = @_;
    my $file = "$gap,-$left_cut,-$right_cut.$cat";
    my $path = "outputs/gap_solutions/$cat/$file";

    # three-argument open: the path can never be misread as an open mode
    open(my $f1, '>', $path) or die "Save: cannot open '$path' for writing: $!\n";
    foreach my $id (@$seq_order){
        my $seq = $seqs->{$id};
        print {$f1} ">$id\n$seq\n";
    }
    # buffered write errors only surface when the handle is closed
    close($f1) or die "Save: cannot close '$path': $!\n";

    $outputs{$cat}{$gap}{'left_cut'}=$left_cut;
    $outputs{$cat}{$gap}{'right_cut'}=$right_cut;
    $outputs{$cat}{$gap}{'seqs'}=$seqs;
    $outputs{$cat}{$gap}{'con'}=$con;
    $outputs{$cat}{$gap}{'size'}=$size;
    $outputs{$cat}{$gap}{'est_size'} =$gap_info{$gap}{'size'};
    $outputs{$cat}{$gap}{'order'}=$seq_order;
}

# Walk along the core steps and merge every mom -> bridge -> next-core-step
# chain that has exactly one bridge into a single step (via MultiCat, joined
# with '->').
#   $core   array ref of core step IDs; it is consumed (emptied) by this sub
#   $links  hash ref: core step => array ref of bridging steps; updated in place
# Returns an array ref holding the condensed core.
sub FinalCondense{
    my ($core, $links) = @_;
    my @condensed;
    while (@$core) {
        my $mom = shift(@$core);
        my $gdt = shift(@$core);
        my $bridges = $links->{$mom};

        # no outgoing links recorded for this step: keep it as it is
        if (!$bridges) {
            push(@condensed, $mom);
            next;
        }

        my $n_bridges = scalar(@$bridges);
        if ($n_bridges > 1) {
            # ambiguous: keep mom and carry on from the next core step
            push(@condensed, $mom);
            unshift(@$core, $gdt);
        }elsif ($n_bridges == 1) {
            # unique bridge: fuse the three steps and retry from the fused step
            my $fused = MultiCat([$mom, $bridges->[0], $gdt], '->');
            $links->{$fused} = $links->{$gdt};
            delete($links->{$mom});
            delete($links->{$gdt});
            unshift(@$core, $fused);
        }
    }
    return \@condensed;
}

# Save the solution(s) found for the current gap (global $gap).
#   $core   array ref of core step IDs
#   $links  hash ref: core step => array ref of bridging steps to the next
#           core step
# Category: 'repeated' when the global $repeated is set, otherwise 'single'
# for one core step and 'multiple' for several.
# With one core step its sequence is saved directly. Otherwise a consensus is
# built by appending, in turn, each core step and the consensus of its
# bridges, while %almt_seqs collects one '-'-padded row per step so that all
# rows line up with the consensus. Everything is handed to Save().
sub output_B{ # output solutions generated from this method not from Abyss or Allpaths.
    my $core=shift;my $links=shift;
    my $concensus='';
    my $f1;
    my $cat;
    if ($repeated) {
        $cat = 'repeated';
        # remember which steps form loops (keys of the global %loop_paths)
        my @repeats = keys%loop_paths;
        $outputs{$cat}{$gap}{'repeats'}=\@repeats;
        print("Repeats!\n");
    }elsif(scalar(@$core)==1){
        $cat='single';
    }elsif(scalar(@$core)>1){
        $cat='multiple';
    }
    if (scalar(@$core)==1) {
        my $id = $core->[0];
        my $seq = $steps{$id}{'seq'};
        # the sequence includes both flanking edges, which are not part of the gap
        my $gap_size = length($seq)-2*$edge_size;
        Save($gap, $cat,$gap_size, $edge_size, $edge_size, $seq, {$id=>$seq}, [$id]);
        print("Single solution\n");
        return;
    }
    my @almts;      # row IDs in the order they were added
    my %almt_seqs;  # row ID => padded row
    my $size = 0;   # current length of the first alignment row
    my $max_in=0;   # longest overlap of the bridges with the current core step (from Synchonize)
    my $max_out=0;  # longest overlap of the bridges with the next core step (from Synchonize)
    for (my $i=0;$i<scalar(@$core);$i++){
        
        my $mom = $core->[$i];
        my $mom_seq = $steps{$mom}{'seq'};
        my $mom_size = $steps{$mom}{'size'};
        
        # append only the part of this core step beyond the overlap ($max_out)
        # that the previous round of bridges already contributed
        my $elong_mom = substr($mom_seq,$max_out,$mom_size-$max_out);
        $concensus=$concensus.$elong_mom;
        # left-pad this row, and right-pad all earlier rows by the newly added length
        $mom_seq = '-'x($size-$max_out).$mom_seq;
        for (my $j=0; $j<scalar(@almts);$j++){
            $almt_seqs{$almts[$j]}=$almt_seqs{$almts[$j]}.'-'x($mom_size-$max_out);
        }
        push (@almts, $mom);
        $almt_seqs{$mom}=$mom_seq;
        my $dts = $links->{$mom};
        if (!$dts) {
            # no bridges leave this core step
            next;
        }
        my $gdt = $core->[$i+1];
        
        (my $dts1, $max_in,$max_out) = Synchonize($dts,$mom,$gdt); #$dts1 refers to a hash
        # muscle is only used when every bridge is within the configured length limit
        my $muscle_controller=1;
        foreach my $dt (values%{$dts1}){
            if (length $dt >$maximum_muscle_alignment_length) {
                $muscle_controller=0;
            }
        }
        
        my $dts_aligned;
        if ($muscle_controller ==0) {
            $dts_aligned = SimpleAlign($dts1,$max_in,$max_out);
        }else{
            $dts_aligned = RunMuscle($dts1);# $dts_aligned refers to a hash
        }
        my @dt_seqs = values%$dts_aligned;
        my $con = GenerateConcensus(\@dt_seqs);
        
        # skip the first $max_in characters of the bridge consensus (the overlap with $mom)
        my $elong_dt = substr($con,$max_in,length($con)-$max_in);
        $concensus=$concensus.$elong_dt;
        my $dt_size = length($dt_seqs[0]);
        # from here on $mom_size is the length of the padded row of $mom
        $mom_size = length($mom_seq);
        for (my $i=0; $i<scalar(@almts);$i++){
            $almt_seqs{$almts[$i]}=$almt_seqs{$almts[$i]}.'-'x($dt_size-$max_in);
        }
        foreach my $dt (keys%$dts_aligned){
            my $dt_seq = $$dts_aligned{$dt};
            $dt_seq = '-'x($mom_size-$max_in).$dt_seq;
            push(@almts, $dt);
            $almt_seqs{$dt}=$dt_seq;
        }
        $size = length($almt_seqs{$almts[0]});
    }
    my $gap_size = length($concensus)-2*$edge_size;
    Save($gap, $cat,$gap_size, $edge_size, $edge_size, $concensus, \%almt_seqs, \@almts);
    print("Multiple solutions\n");
}

# Alignment fallback that needs no external aligner. The longest sequence is
# kept unchanged; every other sequence is rebuilt as
#   <first $max_in chars of the longest> . <its own middle part> .
#   <'-' padding up to the longest length> . <last $max_out chars of the longest>
#   $seqs     hash ref: ID => sequence
#   $max_in   length of the leading part taken from the longest sequence
#   $max_out  length of the trailing part taken from the longest sequence
# Returns a hash ref: ID => aligned sequence.
sub SimpleAlign{
    my ($seqs, $max_in, $max_out) = @_;
    my @by_length = sort { length($seqs->{$b}) <=> length($seqs->{$a}) } keys %$seqs;

    my $ref_id  = shift(@by_length);
    my $ref_seq = $seqs->{$ref_id};
    my $ref_len = length($ref_seq);
    my $head = substr($ref_seq, 0, $max_in);
    my $tail = substr($ref_seq, $ref_len-$max_out, $max_out);

    my %aligned;
    $aligned{$ref_id} = $ref_seq;
    foreach my $id (@by_length){
        my $seq = $seqs->{$id};
        my $len = length($seq);
        my $middle = substr($seq, $max_in, $len-$max_in-$max_out);
        $aligned{$id} = $head.$middle.('-' x ($ref_len-$len)).$tail;
    }
    return \%aligned;
}

# subroutines to refine with mate-pair links
# Rebuild the global %Dist: for each step seen in the solutions, the distance
# before it ('pro') and after it ('suc') inside the gap.
#   $_[0]  array ref of solution paths
# For each step after the first, slot [4] of its loci is set to the previous
# step's slot [4] plus (own loci [0] - previous loci [2]). A step seen more
# than once gets the mean of its stored value and the new value.
sub CountDistance{
    my @solutions = @{$_[0]};
    %Dist = ();
    foreach my $path (@solutions){
        my $total = EstimateDist($path, $startx, $endx);
        for my $pos (1 .. $#{$path}){
            my $node = $path->[$pos];
            my $prev = $path->[$pos-1];
            my $step = $node->[0];

            # running offset of this step, stored in the path itself
            $node->[1]->[4] = $prev->[1]->[4] + ($node->[1]->[0] - $prev->[1]->[2]);
            my $before = $node->[1]->[4];
            my $after  = $total - $before - $steps{$step}{'size'};

            if ($Dist{$step}) {
                $Dist{$step}{'pro'} = ($Dist{$step}{'pro'} + $before)/2;
                $Dist{$step}{'suc'} = ($Dist{$step}{'suc'} + $after)/2;
            }else{
                $Dist{$step}{'pro'} = $before;
                $Dist{$step}{'suc'} = $after;
            }
        }
    }
}

# Use mate-pair link counts to pick one bridge where several bridges connect
# two core steps.
#   $core     array ref of core step IDs
#   $links    hash ref: core step => array ref of bridges
#   $MPlinks  mate-pair link table, passed through to CountMPlinks
# Returns a hash ref with the same shape as $links. A step with one bridge is
# copied as is. Otherwise the bridge with the strictly highest score wins,
# preferring bridges for which CountMPlinks reports key steps; on a tie all
# original bridges are kept.
sub MatePairRefine{
    my ($core, $links, $MPlinks) = @_;
    my %refined;
    foreach my $mom (@$core){
        next if !$links->{$mom};
        my @dts = @{$links->{$mom}};
        if (scalar(@dts) == 1) {
            $refined{$mom} = \@dts;
            next;
        }

        # assume there is only one unique solution; and it is the one with the highest MPlinks counts
        my (%MPscore, %KeySteps);
        foreach my $dt (@dts){
            # the artificial 'NONE' bridge has no unitigs to score
            ($MPscore{$dt}, $KeySteps{$dt}) = ($dt eq 'NONE') ? (0, []) : CountMPlinks($dt, $MPlinks);
        }
        my @keydts = grep { scalar(@{$KeySteps{$_}}) >= 1 } @dts;

        if (@keydts) {
            if (scalar(@keydts) == 1) {
                $refined{$mom} = [$keydts[0]];
            }else{
                my @ranked = sort { $MPscore{$b} <=> $MPscore{$a} } @keydts;
                $refined{$mom} = ($MPscore{$ranked[0]} > $MPscore{$ranked[1]}) ? [$ranked[0]] : \@dts;
            }
            print("MP_links help 1\n");
            next;
        }

        # no bridge has key steps: fall back to the plain scores
        my @ranked = sort { $MPscore{$b} <=> $MPscore{$a} } keys %MPscore;
        if ($MPscore{$ranked[0]} > $MPscore{$ranked[1]}) {
            $refined{$mom} = [$ranked[0]];
            print("MP_links help 2\n");
        }else{
            $refined{$mom} = \@dts;
        }
    }
    return \%refined;
}

# Score one bridge by the mate-pair links between its unitigs and the
# scaffold unitigs on both sides of the current gap (global $gap).
#   $dt       bridge ID; a composite of unitig IDs separated by '-', 'x', '+',
#             '(' or ')'. Anything after a '_' in a component is dropped.
#   $MPlinks  hash ref; $MPlinks->{A}->{B}->{'link'} is read as the link count
#             from A to B
# For every unitig of the bridge the scaffold is walked away from the gap in
# both directions (through %order) until $MPsize minus the unitig's distance
# from the gap edge (%Dist) is used up; all link counts met are added to the
# score. A unitig becomes a "key step" when the last scaffold unitig visited
# on each side has a positive link count.
# Returns ($score, \@KeySteps).
sub CountMPlinks{
    my $dt = shift; my $MPlinks=shift;
    my @tigs = split(/\-|x|\+|\(|\)/, $dt);
    my %tigs;
    my @KeySteps;
    foreach my $tig (@tigs){
        if ($tig) {
            my @tigid = split(/\_/,$tig);
            $tig = $tigid[0];
            $tigs{$tig}++;
        }
    }
    @tigs = keys%tigs;
    my $score=0;
    foreach my $tig (@tigs){

        my $start = $gap_info{$gap}{'from'};
        my $end = $gap_info{$gap}{'to'};
        my $MPsize = 10000; #10kb
        my $dist_pro = $MPsize-$Dist{$tig}{'pro'};
        my $dist_suc = $MPsize-$Dist{$tig}{'suc'};
        my $essen_score_pro=0;
        my $essen_score_suc=0;
        my $startz=$start;
        my $endz=$end;
        while ($dist_pro>0) {
            # Stop at the scaffold end: with no preceding unitig every size
            # below is undefined, $dist_pro would stop shrinking and the
            # loop would never finish.
            last if !defined($startz) or $startz eq '';
            my $next_tig = $order{$startz}{'from'};
            my $size_gap = $gap_info{"$next_tig\-G\-$startz"}{size};
            $startz = $startz.'F';
            my $size_tig = $steps{$startz}{'size'};
            my $LinkCount;
            if ($MPlinks->{$startz}->{$tig}) {
                $LinkCount=$MPlinks->{$startz}->{$tig}->{'link'};
            }else{
                $LinkCount=0;
            }
            #if ($startz eq ($start.'F')) {
                $essen_score_pro = $LinkCount;
            #}
            $score += $LinkCount;
            $startz=$next_tig;
            $dist_pro= $dist_pro-$size_tig-$size_gap;
        }
        while ($dist_suc>0) {
            # same guard for the walk towards the other scaffold end
            last if !defined($endz) or $endz eq '';
            my $next_tig = $order{$endz}{'to'};
            my $size_gap = $gap_info{"$endz\-G\-$next_tig"}{size};
            $endz = $endz.'F';
            my $size_tig = $steps{$endz}{'size'};
            my $LinkCount;
            if ($MPlinks->{$tig}->{$endz}) {
                $LinkCount=$MPlinks->{$tig}->{$endz}->{'link'};
            }else{
                $LinkCount=0;
            }
            #if ($endz eq ($end.'F')) {
                $essen_score_suc = $LinkCount;
            #}
            $score += $LinkCount;
            $endz=$next_tig;
            $dist_suc= $dist_suc-$size_tig-$size_gap;
        }
        if ($essen_score_pro>0 and $essen_score_suc>0) {
            push(@KeySteps, $tig);
        }
    }
    return ($score,\@KeySteps);
}


# Reset the per-gap search state (globals) before a new gap is processed.
# $fst_paths is seeded with one path holding only the start step; @solutions,
# %bridging_unitigs, %ties and %solution_size are emptied.
sub Initialization{
    # loci layout: query-start, query_end, target_start, target_end, total extension size;
    # the total extension of the seed is "-" followed by the size of the start step
    my $seed_loci = [0, 0, 0, 0, '-'.$steps{$start}{'size'}];
    my $seed_path = [[$start, $seed_loci]]; # a path is a reference to an array of sequential steps
    $fst_paths = [$seed_path];              # a reference to an array of different paths
    @solutions = ();         # empty solution array for the next gap.
    %bridging_unitigs = ();  # empty bridging_unitigs hash for the next gap.
    %ties = ();
    %solution_size = ();
}

# Flatten a ties table into a search space of outgoing connections only.
#   $ties  hash ref: step => { 'out' => { next_step => loci }, ... }
# Returns a hash ref: step => { next_step => loci }. The loci references are
# shared with $ties. Steps without outgoing ties do not appear in the result.
sub UpdateSpace{
    my $ties = shift;
    my %space;
    foreach my $step1 (keys %$ties){
        # Read the table that was passed in. The global %ties must not be
        # used here: it would be autovivified and may not match $ties.
        next unless $ties->{$step1} && $ties->{$step1}->{'out'};
        my $out = $ties->{$step1}->{'out'};
        foreach my $step2 (keys %$out){
            $space{$step1}{$step2} = $out->{$step2};
        }
    }
    return \%space;
}

#subroutines to refine the ties
# Recursively merge linear stretches of the tie graph: when $mom has exactly
# one outgoing step and that step has exactly one incoming step, the two are
# concatenated with MultiCat (joined by '+') and the search continues from
# the merged step.
#   $mom             step to start from
#   $searched_steps  hash ref of steps already visited (updated in place)
# Globals written: $improved is set to 1 on a merge; $start / $end are
# renamed when the merged steps include them; %ties is changed by MultiCat.
sub HoriCondenseTies{
    my ($mom, $searched_steps) = @_;
    return if $searched_steps->{$mom};
    my @dts = keys %{$ties{$mom}{'out'}};
    return if !@dts;

    my $first_dt = $dts[0];
    my @moms = keys %{$ties{$first_dt}{'in'}};
    $searched_steps->{$mom}++;

    # branching on either side: nothing to merge here, descend into each successor
    if (scalar(@dts) != 1 or scalar(@moms) != 1) {
        foreach my $dt (@dts){
            HoriCondenseTies($dt, $searched_steps, $improved);
        }
        return;
    }

    my $merged = MultiCat([$mom, $first_dt], '+');
    $improved = 1;
    $start = $merged if $mom eq $start;
    $end   = $merged if $first_dt eq $end;
    HoriCondenseTies($merged, $searched_steps, $improved);
}

# Concatenate a chain of consecutive steps into one new step and rewire the
# global %ties accordingly.
#   $_[0]  array ref of step IDs, in path order
#   $_[1]  separator used to join the IDs into the new step ID
# The new sequence is the first step's sequence followed, for every later
# step, by the part after position loci[3] of the tie from its predecessor.
# The new step is registered in %steps, takes over the incoming ties of the
# first step and the outgoing ties of the last step, and the %ties entries of
# the old steps are deleted.
# Returns the new step ID.
sub MultiCat{
    my @tigs=@{$_[0]};
    my $mark=$_[1];
    my $start = $tigs[0]; my $end =$tigs[$#tigs];
    my $new_id = join($mark,@tigs);
    my $new_seq = $steps{$tigs[0]}{'seq'};
    for(my $i=1;$i<scalar@tigs;$i++){
        my $seq=$steps{$tigs[$i]}{'seq'};
        my $size = $steps{$tigs[$i]}{'size'};
        my $loci = $ties{$tigs[$i-1]}{'out'}{$tigs[$i]};
        # drop the first $$loci[3] characters of this step before appending
        $seq = substr($seq,$$loci[3],$size-$$loci[3]);
        $new_seq = $new_seq.$seq;
    }
    my $new_size = length($new_seq);
    $steps{$new_id}{'seq'} = $new_seq;
    $steps{$new_id}{'size'} = $new_size;
    # update ties
    $ties{$new_id}{'in'}=$ties{$start}{'in'};
    # re-base the outgoing loci in place so that slot [1] is the length of the
    # new sequence and slots [0]..[1] span as many positions as [2]..[3]
    foreach my $out (keys%{$ties{$end}{'out'}}){
        my $locib = $ties{$end}{'out'}{$out};
        $$locib[1] = $new_size;
        $$locib[0] = $$locib[1]-($$locib[3]-$$locib[2]);
    }
    $ties{$new_id}{'out'}=$ties{$end}{'out'};
    
    # point the predecessors at the new step instead of the old first step
    foreach my $step (keys%{$ties{$new_id}{'in'}}){
        $ties{$step}{'out'}{$new_id} = $ties{$new_id}{'in'}{$step};
        delete($ties{$step}{'out'}{$start});
    }
    # point the successors at the new step instead of the old last step
    foreach my $step (keys%{$ties{$new_id}{'out'}}){
        $ties{$step}{'in'}{$new_id} = $ties{$new_id}{'out'}{$step};
        delete($ties{$step}{'in'}{$end});
    }
    # the merged steps no longer have tie entries of their own
    foreach my $tig (@tigs){
            delete($ties{$tig});
    }
    return $new_id;
}

# Recursively look for parallel bridges: steps that all lead from $mom to the
# same next step and each have exactly one incoming and one outgoing tie.
# Two or more such bridges are handed to Assemble; the global $improved is
# set to 1 when Assemble reports success.
#   $mom             step to start from
#   $searched_steps  hash ref of steps already visited (updated in place)
sub VertiCondenseTies{
    my ($mom, $searched_steps) = @_;
    return if $searched_steps->{$mom};

    my @dts = keys %{$ties{$mom}{'out'}};
    if (scalar(@dts) > 1) {
        # group the successors of $mom by the step they lead to
        my %dts_into;
        foreach my $dt (@dts){
            foreach my $target (keys %{$ties{$dt}{'out'}}){
                push(@{$dts_into{$target}}, $dt);
            }
        }
        foreach my $gdt (keys %dts_into){
            # only plain bridges qualify: one way in and one way out
            my @simple;
            foreach my $dt (@{$dts_into{$gdt}}){
                my $n_out = scalar(keys %{$ties{$dt}{'out'}});
                my $n_in  = scalar(keys %{$ties{$dt}{'in'}});
                push(@simple, $dt) if $n_in == 1 and $n_out == 1;
            }
            next if scalar(@simple) <= 1;
            $improved = 1 if Assemble(\@simple, $mom, $gdt);
        }
    }

    # the successors may have changed above; re-read them and continue
    my @next_steps = keys %{$ties{$mom}{'out'}};
    $searched_steps->{$mom}++;
    foreach my $dt (@next_steps){
        VertiCondenseTies($dt, $searched_steps, $improved);
    }
}

# Try to merge parallel bridges between $mom and $gdt into consensus steps.
#   $dts  array ref of bridge step IDs
#   $mom  step preceding the bridges
#   $gdt  step following the bridges
# The bridges are trimmed to a common frame (Synchonize), aligned (RunMuscle)
# and grouped (Classify_Combine). Replaced bridges are removed from %ties and
# the new consensus steps are tied to $mom and $gdt.
# Returns 1 when at least one new step was created, 0 otherwise (including
# when a sequence exceeds $maximum_muscle_alignment_length, in which case
# its length is printed).
sub Assemble{
    my ($dts, $mom, $gdt) = @_;
    my ($dts1, $max_in, $max_out) = Synchonize($dts, $mom, $gdt); # $dts1 refers to a hash

    foreach my $seq (values %$dts1){
        next if length($seq) <= $maximum_muscle_alignment_length;
        print length($seq)."\n";
        return 0;
    }

    my $dts_aligned = RunMuscle($dts1); # $dts_aligned refers to a hash
    my ($new_dts, $dts_to_remove) = Classify_Combine($dts_aligned);
    return 0 if !@$new_dts;

    foreach my $dt (@$dts_to_remove){
        delete($ties{$dt});
        delete($ties{$gdt}{'in'}{$dt});
        delete($ties{$mom}{'out'}{$dt});
    }
    foreach my $new_dt (@$new_dts){
        my $size_dt  = $steps{$new_dt}{'size'};
        my $size_mom = $steps{$mom}{'size'};
        # the same loci array is stored on both sides of each tie
        my $loci_in  = [$size_mom-$max_in+1, $size_mom, 1, $max_in];
        my $loci_out = [$size_dt-$max_out+1, $size_dt, 1, $max_out];
        $ties{$new_dt}{'in'}{$mom}  = $loci_in;
        $ties{$new_dt}{'out'}{$gdt} = $loci_out;
        $ties{$mom}{'out'}{$new_dt} = $loci_in;
        $ties{$gdt}{'in'}{$new_dt}  = $loci_out;
    }
    return 1;
}

# Bring all bridges between $mom and $gdt into one frame so they can be
# aligned: every bridge is rebuilt as
#   <last $max_in chars of $mom> . <its own part between the two overlaps> .
#   <first $max_out chars of $gdt>
# where $max_in / $max_out are the longest overlaps any bridge has with $mom
# / $gdt according to %ties.
#   $dts  array ref of bridge step IDs
# Returns (hash ref: bridge ID => rebuilt sequence, $max_in, $max_out).
sub Synchonize{
    my ($dts, $mom, $gdt) = @_;
    my $mom_size = $steps{$mom}{'size'};
    my $mom_seq  = $steps{$mom}{'seq'};
    my $gdt_seq  = $steps{$gdt}{'seq'};

    # longest overlap on either side
    my ($max_in, $max_out) = (0, 0);
    foreach my $dt (@$dts){
        my $loci_in  = $ties{$dt}{'in'}{$mom};
        my $loci_out = $ties{$dt}{'out'}{$gdt};
        my $in_len  = $loci_in->[1]-$loci_in->[0]+1;
        my $out_len = $loci_out->[3]-$loci_out->[2]+1;
        $max_in  = $in_len  if $in_len  > $max_in;
        $max_out = $out_len if $out_len > $max_out;
    }

    my $prefix = substr($mom_seq, $mom_size-$max_in, $max_in);
    my $suffix = substr($gdt_seq, 0, $max_out);

    my %new_dts;
    foreach my $dt (@$dts){
        my $loci_in  = $ties{$dt}{'in'}{$mom};
        my $loci_out = $ties{$dt}{'out'}{$gdt};
        my $inner = substr($steps{$dt}{'seq'}, $loci_in->[3], $loci_out->[0]-$loci_in->[3]-1);
        $new_dts{$dt} = $prefix.$inner.$suffix;
    }
    return (\%new_dts, $max_in, $max_out);
}

# Align a set of sequences with the external MUSCLE program.
#   $dts  hash ref: ID => sequence
# Writes input.fasta in the current directory, runs muscle to produce
# output.afa and parses that file.
# Returns a hash ref: ID => aligned sequence (multi-line records are joined).
# Dies when a file cannot be written or read; warns when muscle exits with a
# non-zero status.
sub RunMuscle{
    my $dts = shift;
    my $input  = 'input.fasta';
    my $output = 'output.afa';
    my $muscle = '/data4/beem/ST_scripts/muscle3.8.31_i86linux64';

    open(my $f1, '>', $input) or die "RunMuscle: cannot write '$input': $!\n";
    foreach my $dt (keys%{$dts}){
        my $seq = $dts->{$dt};
        print {$f1} ">$dt\n$seq\n";
    }
    close($f1) or die "RunMuscle: cannot close '$input': $!\n";

    # Remove the result of any earlier run first: if muscle fails, the open
    # below must fail too instead of silently re-reading an old alignment.
    unlink($output);
    # list form: no shell is involved
    my $status = system($muscle, '-in', $input, '-out', $output, '-quiet');
    warn "RunMuscle: '$muscle' exited with status $status\n" if $status != 0;

    open(my $f2, '<', $output) or die "RunMuscle: cannot read '$output': $!\n";
    my %seq_aligned;
    # lexical on purpose: these must not leak into package globals
    my ($id, $seq);
    while (my $line = <$f2>) {
        chomp $line;
        if ($line =~ /^\>/){
            # the ID is the text after '>' up to the next blank or '>'
            my @headings = split(/\ |\>/, $line);
            $id=$headings[1];
            $seq=''; # empty the sequence carrier
        }else{
            next if !defined($id); # sequence text before any header: nothing to attach it to
            $seq = $seq.$line; # concatenate sequences from multiple lines if existing
            $seq_aligned{$id}=$seq; # update
        }
    }
    close($f2);
    return (\%seq_aligned);
}

# Group aligned sequences that are similar enough to be merged, and build one
# consensus step per group.
#   $seqs  hash ref: ID => aligned sequence
# Step 1 repeatedly takes one sequence as seed and puts every remaining
# sequence whose pairwise consensus with the seed passes Judge() into the
# seed's group. Step 2 builds, for every group of two or more, the consensus
# of the whole group; when it passes Judge() it is registered in %steps under
# the member IDs joined by 'x' (IDs that contain '+', 'x' or '-' are put in
# parentheses first).
# Returns (array ref of new step IDs, array ref of the IDs they replace).
sub Classify_Combine{
    my $seqs = shift;
    my @pending = keys %{$seqs};
    my @groups;

    #step 1: classify
    do {
        my $seed = pop(@pending);
        my $seed_seq = $seqs->{$seed};
        my @members = ($seed);
        my @leftover;
        foreach my $candidate (@pending){
            my $pair_consensus = GenerateConcensus([$seed_seq, $seqs->{$candidate}]);
            if (Judge($pair_consensus)) {
                push(@members, $candidate);
            }else{
                push(@leftover, $candidate);
            }
        }
        push(@groups, \@members);
        @pending = @leftover;
    } while (scalar(@pending));

    #step 2: flatten
    my @products;
    my @dts_to_remove;
    foreach my $group (@groups){
        my @ids = @$group;
        next if scalar(@ids) == 1;

        my $new_id = join('x', map { ($_ =~ /[+x-]/) ? "($_)" : $_ } @ids);
        my $concensus = GenerateConcensus([ map { $seqs->{$_} } @ids ]);
        if (Judge($concensus)) {
            $steps{$new_id}{'seq'}  = $concensus;
            $steps{$new_id}{'size'} = length($concensus);
            push(@products, $new_id);
            push(@dts_to_remove, @ids);
        }
    }
    return (\@products, \@dts_to_remove);
}

# Build a consensus from aligned sequences, column by column.
#   $seqs  array ref of aligned sequences; the first one sets the number of
#          columns
# For each column the set of distinct characters is reduced to one symbol by
# CodonRules(). Returns the concatenated symbols (undef when the first
# sequence has no columns).
sub GenerateConcensus{
    my $seqs = shift;
    my $product;
    my $n_columns = length($seqs->[0]);
    foreach my $col (0 .. $n_columns-1){
        my %seen;
        foreach my $row (0 .. $#{$seqs}){
            $seen{ substr($seqs->[$row], $col, 1) }++;
        }
        $product .= CodonRules([keys %seen]);
    }
    return $product;
}
# Reduce the characters seen in one alignment column to a single symbol.
#   $_[0]  array ref of characters (A/T/G/C, '-', or IUPAC ambiguity codes)
# Rules:
#   - any 'N' or 'n'                 -> 'N'
#   - only gaps ('-')                -> '' (the column is dropped)
#   - a gap together with any base   -> 'N'
#   - one base                       -> that base
#   - two or three bases             -> the matching IUPAC ambiguity code
#   - four bases                     -> 'N'
# Ambiguity codes in the input are first expanded to the bases they stand
# for. Characters that are not recognised are ignored; when nothing is left
# an empty string is returned.
sub CodonRules{
    my @observed = @{$_[0]};

    # ambiguity code => bases it stands for
    my %expand = (
        M => ['A','C'],     R => ['A','G'],     W => ['A','T'],
        S => ['C','G'],     Y => ['C','T'],     K => ['G','T'],
        V => ['A','C','G'], H => ['A','C','T'],
        D => ['A','G','T'], B => ['C','G','T'],
    );

    #decode
    my %present;
    foreach my $symbol (@observed){
        if ($symbol =~ /[ATGC-]/) {
            $present{$symbol}++;
        }elsif ($expand{$symbol}) {
            foreach my $base (@{$expand{$symbol}}){
                $present{$base}++;
            }
        }elsif ($symbol eq 'N' or $symbol eq 'n') {
            return ('N');
        }
    }

    my @kinds = keys %present;
    my $count = scalar(@kinds);

    if ($present{'-'}) {
        return ($count == 1) ? ('') : ('N');
    }

    #encode
    if ($count == 1) {
        return uc($kinds[0]);
    }elsif ($count == 2) {
        foreach my $rule (['A','C','M'], ['A','G','R'], ['A','T','W'],
                          ['C','G','S'], ['C','T','Y'], ['G','T','K']){
            my ($first, $second, $code) = @$rule;
            return ($code) if $present{$first} and $present{$second};
        }
    }elsif ($count == 3) {
        # the code is determined by the one base that is missing
        foreach my $rule (['T','V'], ['G','H'], ['C','D'], ['A','B']){
            my ($missing, $code) = @$rule;
            return ($code) if !$present{$missing};
        }
    }elsif ($count >= 4) {
        return ('N');
    }
    return '';
}

# Decide whether a consensus sequence is clean enough to be accepted.
#   $seq  consensus sequence
# Returns 0 when the sequence contains four consecutive N's, or when any
# window of $window consecutive characters holds $allow or more characters
# other than A, T, G and C; returns 1 otherwise. Sequences shorter than the
# window are judged on the characters they have.
sub Judge{
    my $seq = shift;
    $seq = '' if !defined($seq);
    if ($seq =~ /NNNN/) {
        return 0;
    }

    my $window = 10;
    my $allow  = 4;
    my $nonATGC = 0;   # non-ATGC characters inside the current window

    for (my $i=0; $i<length($seq); $i++){
        # the character entering the window (every position counts, index 0 included)
        if (substr($seq,$i,1) !~ /[ATGC]/) {
            $nonATGC++;
        }
        # the character leaving the window, once the window is full
        if ($i >= $window and substr($seq,$i-$window,1) !~ /[ATGC]/) {
            $nonATGC--;
        }
        if ($nonATGC >= $allow) {
            return 0;
        }
    }
    return 1;
}


#-----------------subroutines to further condense solutions------------------------

# Condense several alternative paths into the steps shared by all of them
# plus, between each pair of consecutive shared steps, the set of distinct
# bridges.
#   $paths  array ref of paths; a path is an array ref of [step_id, loci]
# Returns ($concensus, \%CoreBridges): an array ref of shared step IDs and a
# hash mapping each shared step to an array ref of bridges leading to the
# next shared step. Returns nothing when $paths is empty. With a single path
# its steps are returned together with an empty bridge hash.
# Side effects: BuildNewSteps and Assemble modify the globals %steps and %ties.
sub CondensePaths{
    my $paths = shift;
    # the first path is the reference that all paths are compared against
    my $concensus = GetSteps($paths->[0]);
    my %CoreBridges;
    if (!@$paths) { # if there is no paths to condense
        return;
    }
    my $num_paths = scalar(@$paths);
    if ($num_paths==1) { #if there is only one path, no need to condense.
        return ($concensus, \%CoreBridges);
    }
    # step 1: to condense solutions and to identify share nodes
    # $index->[k] collects, one entry per path in which Collapse found reference
    # step k, the position of that step in the path; the loop starts at 0, so
    # the reference path is compared with itself as well
    my $index;
    for (my $i=0; $i<scalar(@$paths); $i++){
        my $next_path = GetSteps($paths->[$i]);
        $index = Collapse($concensus,$next_path, $index);
    }
    # step 2: to identify unique paths between shared nodes
    # Combine keeps only the steps that have an entry for every path
    ($concensus, $index)=Combine($concensus, $index, $num_paths);
    
    for (my $i=0; $i<scalar(@$concensus)-1; $i++){
        my $step = $$concensus[$i];
        my @tig1 = @{$index->[$i]};
        my @tig2 = @{$index->[$i+1]};
        my %bridges;
        for (my $j=0; $j<scalar(@tig1);$j++){
            my $path = $paths->[$j];
            # slice of path $j from this shared step to the next one, both included
            my @subpath = @$path[$tig1[$j]..$tig2[$j]];
            my $bridge = GetSteps(\@subpath);
            my $id = join('-',@{$bridge});
            # identical bridges from different paths collapse into one key
            $bridges{$id}++;
        }
        my $bridges = [keys%bridges];
        #if (scalar(@$bridges)>=1) {
         #   $bridges=Assemble($bridges,\%ties); # assemble and combine different bridges between two shared nodes.
        #}
        $CoreBridges{$step}=$bridges;
    }
    # step 3: build new steps and update ties and assemble if possible
    for(my $i=0; $i<(scalar@$concensus-1); $i++){
        my $mom = $concensus-> [$i];
        my $gdt = $concensus->[$i+1];
        my $bridges = $CoreBridges{$mom};
        my @dts=BuildNewSteps($bridges, $mom, $gdt);
        Assemble(\@dts,$mom,$gdt);
        # after assembling, the bridges are whatever now leaves $mom in %ties
        $CoreBridges{$mom} = [keys%{$ties{$mom}{'out'}}];
    }

    return ($concensus, \%CoreBridges);
}
# Turn every bridge between two shared steps into exactly one step.
#   $paths  array ref of bridge IDs of the form "mom-step1-...-stepN-gdt"
#   $mom    shared step at the start of the bridges
#   $gdt    shared step at the end of the bridges
# - A bridge without inner steps (mom directly followed by gdt) becomes the
#   artificial step 'NONE', whose sequence is the overlap taken from the end
#   of $mom.
# - A bridge with one inner step is used as it is.
# - A bridge with several inner steps is concatenated into a new step whose
#   ID is the inner IDs joined by '-'.
# %steps and %ties are updated: inner steps that were merged lose their
# %ties entry, and afterwards $mom's outgoing and $gdt's incoming ties refer
# only to the steps returned.
# Returns the list of resulting step IDs.
sub BuildNewSteps{
    my $paths = shift;
    my $mom = shift;
    my $mom_seq=$steps{$mom}{'seq'};
    my $mom_size =$steps{$mom}{'size'};
    my $gdt = shift;
    my %dts;     # steps that represent a whole bridge
    my %olddts;  # inner steps that were merged into a longer step
    foreach my $path (@$paths){
        my @steps = split(/\-/, $path);
        shift@steps; pop@steps; # drop $mom and $gdt, keep the inner steps
        if (!@steps) {
            # $mom overlaps $gdt directly: insert 'NONE' between them
            my $dt = 'NONE';
            push (@steps, $dt);
            my $loci = $ties{$mom}{'out'}{$gdt};
            my $size = $$loci[1]-$$loci[0]+1;
            my $seq = substr($mom_seq, $mom_size-$size, $size);
            $steps{$dt}{'seq'}=$seq;
            $steps{$dt}{'size'}=$size;
            $ties{$mom}{'out'}{$dt}= [$$loci[0],$$loci[1],1,$size];
            $ties{$dt}{'in'}{$mom}= [$$loci[0],$$loci[1],1,$size];
            $ties{$gdt}{'in'}{$dt}= [1,$size,$$loci[2],$$loci[3]];
            $ties{$dt}{'out'}{$gdt}= [1,$size,$$loci[2],$$loci[3]];
            delete($ties{$mom}{'out'}{$gdt});
            delete($ties{$gdt}{'in'}{$mom});
        }
        if (scalar@steps==1) {
            $dts{$steps[0]}++;
        }else{
            my $dt = join('-', @steps);
            my $seq = $steps{$steps[0]}{'seq'};
            for (my $i=1; $i<scalar@steps;$i++){
                my $old = $steps[$i-1];
                my $new = $steps[$i];
                my $loci = $ties{$old}{'out'}{$new};
                my $newseq = $steps{$new}{'seq'};
                my $newsize =$steps{$new}{'size'};
                # append only the part after the overlap with the previous step
                my $elong = substr($newseq, $$loci[3], $newsize-$$loci[3]);
                $seq = $seq.$elong;
            }
            my $size = length($seq);
            $steps{$dt}{'seq'}=$seq;
            $steps{$dt}{'size'}=$size;
            my $step_1st = $steps[0];
            my $step_last = $steps[$#steps];
            my @loci_in = @{$ties{$mom}{'out'}{$step_1st}};
            my @loci_out = @{$ties{$gdt}{'in'}{$step_last}};
            # re-base the outgoing loci to the end of the merged sequence
            $loci_out[1]=$size;
            $loci_out[0] = $size-($loci_out[3]-$loci_out[2]);
            $ties{$mom}{'out'}{$dt}=\@loci_in;
            $ties{$gdt}{'in'}{$dt}=\@loci_out;
            $ties{$dt}{'in'}{$mom}=\@loci_in;
            $ties{$dt}{'out'}{$gdt}=\@loci_out;
            $dts{$dt}++;
            foreach my $step (@steps){
                $olddts{$step}++;
            }
        }
    }
    foreach my $step (keys%olddts){
        if (!$dts{$step}) {
            delete($ties{$step});
        }
    }
    foreach my $dt (keys(%{$ties{$mom}{'out'}})){
        if (!$dts{$dt}) {
            delete($ties{$mom}{'out'}{$dt});
        }
    }
    foreach my $dt (keys(%{$ties{$gdt}{'in'}})){
        if (!$dts{$dt}) {
            # Remove the stale INCOMING tie of $gdt. Deleting from
            # $ties{$mom}{'out'} here (as before) did nothing, because those
            # keys were already removed by the loop above.
            delete($ties{$gdt}{'in'}{$dt});
        }
    }
    return (keys%dts);
}

# Keep only the steps that were found in every path.
#   $concensus  array ref of step IDs
#   $index      array ref, parallel to $concensus; each element is an array
#               ref with one position per path in which the step was found
#   $num_paths  total number of paths
# Returns (array ref of kept step IDs, array ref of copies of their position
# lists).
sub Combine{
    my ($concensus, $index, $num_paths) = @_;
    my (@shared_steps, @shared_index);
    foreach my $pos (0 .. scalar(@$index)-1){
        my @hits = @{$index->[$pos]};
        next if scalar(@hits) != $num_paths;
        push(@shared_steps, $concensus->[$pos]);
        push(@shared_index, \@hits);
    }
    return (\@shared_steps, \@shared_index);
}

# Find the steps that two paths have in common, in order, and record where
# each one sits in the second path.
#   $_[0]  array ref of step IDs of the reference path
#   $_[1]  array ref of step IDs of the path being compared
#   $_[2]  index built so far (array ref or undef); element k collects the
#          positions, in the compared paths, of the step at position k of
#          the reference path
# Returns the index with the positions from this comparison appended.
sub Collapse{
    my @path1 = @{$_[0]};
    my $size1 = scalar(@path1);
    my @path2 = @{$_[1]};
    my $size2 = scalar(@path2);
    my $i=0;
    my @dump1;  # steps taken from path1 since the last common step
    my @dump2;  # steps taken from path2 since the last common step
    my $index1;
    my $index2;
    my $index = $_[2];
    while (@path1 or @path2) { # iterate through two arrays and find the concensus
        # move one step from each path (while it has any) into its dump
        my $step1;
        if (@path1) {
            $step1 = shift(@path1);
            push (@dump1, $step1);
        }
        my $step2;
        if (@path2) {
            $step2 = shift(@path2);
            push (@dump2, $step2);
        }
        my ($i,$j) = GetCommon(\@dump1,\@dump2);# the indexes of the shared step in the two arrays
        if ($i ne '' and $j ne '') {
            # a common step was found: give back whatever was taken beyond it
            my @reimburse1;
            my @reimburse2;
            if ($i<$#dump1) {
                @reimburse1 = @dump1[$i+1..$#dump1];
                unshift(@path1,@reimburse1);
            }
            if ($j<$#dump2) {
                @reimburse2 = @dump2[$j+1..$#dump2];
                unshift(@path2,@reimburse2);
            }
            @dump1 =();
            @dump2=();
            # everything up to the common step has been consumed, so its
            # position is (original size - what is left - 1)
            $index1 = $size1 - scalar(@path1)-1;
            $index2 = $size2 - scalar(@path2)-1;
            push (@{$$index[$index1]},$index2);
        }
    }
    return $index;
}

# Find the element that two lists have in common.
#   two array refs of step IDs
# Returns (index in the first list, index in the second list) of the matching
# pair. Returns ('', '') when there is no match, and also when there are two
# or more matching pairs.
sub GetCommon{
    my ($left, $right) = @_;
    my ($hit_left, $hit_right) = ('', '');
    my $matches = 0;
    foreach my $i (0 .. scalar(@$left)-1){
        foreach my $j (0 .. scalar(@$right)-1){
            next if $left->[$i] ne $right->[$j];
            ($hit_left, $hit_right) = ($i, $j);
            $matches++;
        }
    }
    # more than one candidate is treated as "nothing found"
    return ('', '') if $matches >= 2;
    return ($hit_left, $hit_right);
}

# Extract the step IDs from a path.
#   $path  array ref of steps, each step being [step_id, loci]
# Returns an array ref of the step IDs in path order (empty for an undefined
# path).
sub GetSteps{
    my $path = shift;
    my @steps = map { $_->[0] } @{ $path || [] };
    return (\@steps);
}


#-----------------subroutines to check whether the gap between two unitigs is already resolved in the ABySS contigs-----------
# Check whether the two unitigs flanking the current gap (global $gap) are
# both found, by find_contigs, in the same ABySS contig; if so, let
# print_contig_solution save the solution.
# Returns 1 when a solution was saved, 0 otherwise.
sub check_ABySS_contigs{
    # each hit is (four loci values, contig ID); the contig ID is '' when nothing usable was found
    my @start_contig = find_contigs($gap_info{$gap}{'from'}.'F');
    my @end_contig   = find_contigs($gap_info{$gap}{'to'}.'F');

    my $contig = $start_contig[4];
    return 0 if $contig eq '' or $contig ne $end_contig[4];
    return print_contig_solution(\@start_contig, \@end_contig) ? 1 : 0;
}

# Look up the contig hit recorded for a unitig in the global %uxc.
#   $unitig  unitig ID followed by one orientation character, e.g. 001F
# Only the first contig returned by keys() is examined. It is accepted when
# the span loci[0]..loci[1] equals the unitig size from %unitig_info.
# Returns (loci[0], loci[1], loci[2], loci[3], contig ID) for an accepted hit
# and (0, 0, 0, 0, '') otherwise.
sub find_contigs{
    my $unitig = shift; #001F
    my $ID = substr($unitig, 0, length($unitig)-1); # strip the orientation character
    my $size_u = $unitig_info{$ID}{'size'};
    my @miss = (0, 0, 0, 0, '');

    return @miss if !$uxc{$unitig};

    my ($contig) = keys %{$uxc{$unitig}}; # retrieve the first hit
    my $loci = $uxc{$unitig}{$contig};

    # reject hits that do not cover the unitig from end to end
    return @miss if $size_u != ($loci->[1]-$loci->[0]+1);

    my @hit = ($loci->[0], $loci->[1], $loci->[2], $loci->[3], $contig);
    return @hit;
}

# Save the gap sequence taken from an ABySS contig that contains both
# flanking unitigs.
#   $_[0]  hit of the preceding unitig, as returned by find_contigs
#   $_[1]  hit of the succeeding unitig, as returned by find_contigs
# The last character of the contig ID is its orientation; for 'R' the contig
# sequence is passed through RC_DNA first.
# Returns 0 (saving nothing) when the extracted sequence contains N or n,
# otherwise 1 after calling Save. The category is 'abyss+allpaths' when the
# gap's 'std' value is 0 and 'abyss_only' otherwise.
sub print_contig_solution{
    my @start = @{$_[0]};
    my @end = @{$_[1]};
    my $overlap = $end[2]-$start[3];

    my $oriented_id = $start[4];
    my $contig = substr($oriented_id, 0, length($oriented_id)-1);
    my $direction = substr($oriented_id, length($oriented_id)-1, 1);
    my $contig_seq = $contigs{$contig};
    if ($direction eq "R"){
        # reverse the contig if it is in reverse orientation when overlapping with the unitigs
        $contig_seq = RC_DNA($contig_seq);
    }

    my ($seq, $cut) = ('', 0);
    if ($overlap > 1){
        # the unitigs are apart: take what lies between them
        $seq = substr($contig_seq, $start[3], $overlap-1);
    }elsif ($overlap < 1){
        # the unitigs overlap: take the shared stretch and report its length as the cut
        $cut = abs($overlap-1);
        $seq = substr($contig_seq, $end[2]-1, $cut);
    }

    return 0 if $seq =~ /[Nn]/;

    my $cat = ($gap_info{$gap}{'std'}==0) ? 'abyss+allpaths' : 'abyss_only';
    Save($gap, $cat, length($seq), $cut, $cut, $seq, {$gap=>$seq}, [$gap]);
    return 1;
}

#------------subroutines to read blast outputs---------------
# parseblast: read a tabular blast result file ("-outfmt 6") and record the
# overlaps it describes in a storage hash.
#   $blast_results - name of the blast output file
#   $storage       - hash reference that store() fills
# For the file named "uxc.txt" (unitigs versus contigs) each row is stored once,
# with the query in "F" orientation and code "c". For any other file each row is
# stored twice with code "u": once with the query in "F" orientation and once in
# "R" orientation (coordinates converted by Rev_blast).
#   "F" means the unitig is in the same orientation as in the unitig file.
#   "R" means the unitig is in the opposite orientation.
# Rows for which itself() is true (a sequence hitting itself) are skipped, and so
# are rows with fewer than 10 tab-separated columns (blank or malformed lines).
# Dies if the file cannot be opened.
sub parseblast {
   my ($blast_results, $storage) = @_;

   # Three-argument open: the file name is never interpreted as a mode, and a missing file is reported.
   open(my $fh, '<', $blast_results)
      or die "parseblast: cannot open blast output '$blast_results': $!\n";

   while (my $line = <$fh>){
      chomp $line;
      my @coln = split(/\t/, $line);      # the blast output file should use the format of "-outfmt 6"
      next if scalar(@coln) < 10;         # columns 0,1 and 6..9 are required below

      my ($query, $hit) = @coln[0, 1];
      my @loci = @coln[6 .. 9];           # the loci defining the overlapping of the two sequences

      # remove the rows in which query and hit are identical and the overlapping is the whole sequence
      next if itself($query, $hit, \@loci);

      if ($blast_results eq "uxc.txt"){   # unitigs versus contigs: store once and move on
         store($query."F", $hit, \@loci, $storage, "c");
         next;
      }

      store($query."F", $hit, \@loci, $storage, "u");
      # Reverse the direction of the query unitig and deduce the overlapping information accordingly.
      my @loci_rev = Rev_blast($query, \@loci);
      store($query."R", $hit, \@loci_rev, $storage, "u");
   }
   close($fh);
   return;
}

# store: record one blast hit in a storage hash.
#   $query   - query unitig with its orientation suffix (e.g. 001F)
#   $hit     - hit name without suffix; "F" or "R" is appended here
#   $old     - array ref of four loci (query start, query end, target start, target end);
#              it is copied, the caller's array is not changed
#   $storage - hash ref filled as $storage->{query}{hit} = loci
#   $code    - "u" when the hit is a unitig (size from %unitig_info),
#              "c" when it is a contig (size from the sequence in %contigs)
# When target start > target end the hit is labelled "R" and both target
# coordinates are converted with size - coordinate + 1; otherwise it is labelled "F".
# Code "c": the hit is stored only if the query has no entry yet; if the query
#   already has an entry whose query span equals the new one, that entry is deleted.
# Other codes: only hits where the query end equals the query size and the
#   (converted) target start is 1 are kept; an existing entry for the same
#   query/hit pair is replaced when betterhit() says so.
sub store{
   my ($query, $hit, $old, $storage, $code) = @_;
   my $unitigID   = substr($query, 0, -1);
   my $size_query = $unitig_info{$unitigID}{'size'};
   my $loci       = [@$old];

   my $forward = ($loci->[2] <= $loci->[3]);
   unless ($forward) {# the target is in R direction
         my $size_hit = ($code eq "u") ? $unitig_info{$hit}{'size'}
                                       : length($contigs{$hit});
         $loci->[3] = $size_hit - $loci->[3] + 1; # convert the loci
         $loci->[2] = $size_hit - $loci->[2] + 1;
   }
   $hit .= $forward ? "F" : "R"; # elongate the path

   if ($code eq 'c') {
        my $known = $storage->{$query};
        if ($known) {
            my ($old_hit) = keys(%$known);
            my $old_loci  = $known->{$old_hit};
            if (($old_loci->[1] - $old_loci->[0]) == ($loci->[1] - $loci->[0])) {
                delete($storage->{$query})
            }
        }
        else {
            $storage->{$query}{$hit} = $loci;
        }
   }
   elsif (($loci->[1] == $size_query) and ($loci->[2] == 1)) { # only consider the perfect overlap between the two contigs.
        my $existing = $storage->{$query}{$hit};
        if (!$existing or betterhit($existing, $loci)) {
            $storage->{$query}{$hit} = $loci;
        }
   }
}

# betterhit: decide whether a new overlap should replace the stored one.
#   $_[0] - array ref, loci of the stored hit
#   $_[1] - array ref, loci of the new hit
# Compares the query spans (loci[1] - loci[0]) of both hits.
# Returns 0 when the spans are equal and 1 whenever they differ
# (no matter which of the two is longer).
sub betterhit{
    my ($stored, $fresh) = @_;
    my $stored_span = $stored->[1] - $stored->[0];
    my $fresh_span  = $fresh->[1]  - $fresh->[0];
    return ($stored_span == $fresh_span) ? 0 : 1;
}

# betterhit_c: three-way comparison of the query spans of two hits.
#   $_[0] - array ref, loci of the stored hit
#   $_[1] - array ref, loci of the new hit
# The span of a hit is loci[1] - loci[0].
# Returns  1 when the new hit is longer than the stored one,
#         -1 when both spans are equal,
#          0 when the new hit is shorter.
sub betterhit_c{
    my @a = @{$_[0]};
    my $size_a = $a[1]-$a[0];
    my @b = @{$_[1]};
    my $size_b = $b[1]-$b[0];
    if ($size_a < $size_b) {
        return 1;
    }elsif ($size_a == $size_b){ # numeric comparison; a plain "=" here would assign and report a tie for longer stored hits
        return -1;
    }else{
        return 0;
    }
}

# Rev_blast: deduce overlapping information when the query unitig is reversed.
#   $_[0] - query unitig ID (key of %unitig_info, no orientation suffix)
#   $_[1] - array ref (query start, query end, target start, target end)
# Returns the 4-element list for the reversed query: the query coordinates are
# mirrored with size - coordinate + 1 and swapped, and the two target
# coordinates are swapped.
sub Rev_blast{
   my ($query, $loci) = @_;
   my ($q_start, $q_end, $t_start, $t_end) = @$loci;
   my $size = $unitig_info{$query}{'size'};
   return ($size - $q_end + 1, $size - $q_start + 1, $t_end, $t_start);
}

# itself: determine if a blast row is just a sequence hitting itself.
#   $query - query name from the blast row
#   $hit   - hit/target name from the blast row
#   $loci  - array ref (query start, query end, target start, target end)
# Returns 1 in two cases, 0 otherwise:
#   case 1: query and hit have the same name and the alignment covers both
#           sequences completely (sizes from %unitig_info), e.g. 001W -> 001W;
#   case 2: after a trailing W or E is removed from each name they are the same,
#           both aligned spans are equal, and the span equals $edge_size,
#           e.g. 001E to 001F (1,2000), or 001W to 001F (15145,17144).
sub itself{
   my ($query, $hit, $loci) = @_;
   my $span_query = abs($loci->[1] - $loci->[0]) + 1;
   my $span_hit   = abs($loci->[3] - $loci->[2]) + 1;
   my $same_span  = ($span_query == $span_hit);
   # Full lengths are looked up under the names as given, before any W/E letter is removed.
   my $length_query = $unitig_info{$query}{'size'};
   my $length_hit   = $unitig_info{$hit}{'size'};

   # case 1
   return 1 if (($query eq $hit) and $same_span
                and ($length_query == $span_query) and ($length_hit == $span_hit));

   # Drop the last character of a name that ends in W or E.
   foreach my $name ($query, $hit) {
      chop $name if $name =~ /[WE]$/;
   }

   # case 2
   return 1 if (($query eq $hit) and $same_span and ($span_query == $edge_size));
   return 0;
}

#------------subroutines for solution searching--------------
# Search($fst_paths, \%space, $gap_size_deviation, 1);

# Search: recursive driver of the path search.
#   $paths              - reference to an array of paths to extend
#   $space              - reference to the search-space hash that blast() reads
#   $gap_size_deviation - the allowed size deviation, handed to refine() and check()
#   $mode               - 1 limited search, 0 exhaustive search (handed to refine())
# For each path: blast() proposes extended paths, refine() filters them, check()
# takes out the paths that reached the end, and whatever check() returns is
# searched further by calling Search again. Recursion stops on a branch when
# check() returns nothing.
sub Search{
   my ($paths, $space, $gap_size_deviation, $mode) = @_;
   for my $path (@$paths) {
      my $extended   = blast($path, $space);                                     # new paths continued from the current path
      my $filtered   = refine($extended, $path, $gap_size_deviation, $mode);     # refine and filter new paths
      my $unresolved = check($filtered, $gap_size_deviation);                    # solutions are taken out here
      next unless $unresolved;                                                   # nothing left to extend on this branch
      Search($unresolved, $space, $gap_size_deviation, $mode);
   }
}

# blast: find overlapping unitigs that can extend a path by one step.
#   $_[0] - reference to the path: an array of steps, each step being
#           [step name, loci array ref]; the name of the last step (e.g. 167F)
#           is the key looked up in the search space
#   $_[1] - reference to the search-space hash: $space->{step}{next step} = loci array ref
# Returns a reference to an array of new paths, one per hit of the last step.
# Each new path is a fresh array holding the steps of the input path plus one
# new step [hit name, copy of the hit's loci], so callers may modify the loci of
# the new step without touching the search space.
# Returns undef when the path is empty or its last step has no hits.
sub blast{
   my ($path_ref, $space) = @_;
   my @path = @$path_ref;               # work on a copy of the path array
   my $newpaths;                        # stays undef when there is nothing to extend
   return ($newpaths) unless @path;

   my $last_step = $path[-1]->[0];      # 167F
   my $hits = $$space{$last_step};
   return ($newpaths) unless $hits;     # this unitig hits nothing

   foreach my $newstep (keys %$hits){   # iterate through different blast hits
      # (query-start, query-end, target-start, target-end), copied per path
      my @loci = @{ $hits->{$newstep} };
      push (@$newpaths, [ @path, [$newstep, \@loci] ]);
   }
   return ($newpaths);
}

# refine: remove problematic paths to narrow the search space.
#   arg 1 - reference to an array of new (daughter) paths, each one step longer than the mother path
#   arg 2 - the mother path from which these daughter paths were derived
#   arg 3 - allowed gap size deviation
#   arg 4 - search mode: 1 limited, 0 exhaustive
# Reads the globals $end, %edges, %gap_info, $gap, %unitig_info and %bridging_unitigs;
# may push onto the global @solutions and calls Record_loop, CheckGapSize and StoreTies.
# Side effect: the accumulated extension size is appended to the loci array of the
# last step of every daughter path that gets past the edge filter.
# Returns a reference to the array of surviving paths (undef when none survive).
sub refine{ # remove problematic paths to narrow the searching space.
   my $newpaths = shift; # array ref of new paths (daughter paths)
   my $oldpath = shift; # the mother path, from which these daughter paths were derived.
   my $gap_size_deviation=shift;
   my $mode = shift;
   # Map every step name of the mother path to its 1-based position; used below to detect loops.
   my %oldsteps;
   for (my $i=0;$i<scalar(@$oldpath);$i++){
    $oldsteps{$oldpath->[$i]->[0]}=$i+1;
   }
   my $candidates;
   foreach my $newpath (@$newpaths){
      my $step_count = scalar(@$newpath);
      # Hard cap on path length: paths with more than 300 steps are dropped.
      if ($step_count>300) {
        next;
      }
      
      my $new = $step_count -1;
      my $old = $step_count -2;
      my $newstep = $newpath->[$new]->[0]; #the last step
      my $oldstep = $newpath->[$old]->[0];# the 2nd last step
      my $oldunitig = substr($oldstep, 0, length($oldstep)-1);
      my $newunitig = substr($newstep, 0, length($newstep)-1);
      my $newloci = $newpath->[$new]->[1];
      my $oldloci = $newpath->[$old]->[1];
      my $size_newunitig = $unitig_info{$newunitig}{'size'};
      #check if the new step is the edge of another contig within scaffold
      if (($newstep ne $end) and $edges{$newunitig}){next;}
      #check if the gap size is within allowed range.
      # extension so far = extension stored in element [4] of the previous step's loci
      #                    + (query start of the new overlap - target start of the previous overlap)
      # NOTE(review): element [4] of the very first step is presumably initialised by the caller -- confirm.
      my $progress2 = $$newloci[0]-$$oldloci[2];
      my $extension_size = $$oldloci[4]+$progress2;
      push (@$newloci,$extension_size); # remembered on the step so the next extension can build on it
      my $expected_size = $gap_info{$gap}{'size'};
      my $max_size_allowed = $expected_size + $gap_size_deviation;
      if ($extension_size>$max_size_allowed){next;}
      # record and control loops
      # A step that already occurs in the mother path closes a loop: record it and stop extending this path.
      if ($oldsteps{$newstep}){
        Record_loop($newpath, $oldsteps{$newstep});
        next;
      }
      # control the search mode: 0 exhaustive or 1 limited
      if ($mode == 1 ) { #
            # Limited mode: if a continuation is already stored for this step in %bridging_unitigs,
            # append it to the path instead of searching further, and keep the path as a
            # solution when CheckGapSize accepts it. The path is not searched further either way.
            if ($bridging_unitigs{$newstep}){
                push(@$newpath, @{$bridging_unitigs{$newstep}});
                if (CheckGapSize($newpath, $gap_size_deviation)){
                   push (@solutions, $newpath);
                   StoreTies($newpath);
                }
                next;
            }
      }
      
      
      push (@$candidates, $newpath);
   }
   return ($candidates);#return candidate paths that passed the filters to continue searching
}

# Record_loop: remember a loop found in a path.
#   $newpath - reference to the path whose last step repeats an earlier step
#   $n       - 1-based position of that earlier step in the path
# The loop is the part of the path from position $n to the end. Its step names,
# joined with '-', are the key in the global %loop_paths; a loop not seen before
# is appended to the list in the global %loops under the name of its first step.
sub Record_loop{
    my ($newpath, $n) = @_;
    my $first = $n - 1;                               # 0-based index where the loop starts
    my $start_unitig = $newpath->[$first]->[0];
    my $looppath;
    my @names;
    foreach my $i ($first .. $#{$newpath}) {
        my $step = $newpath->[$i];
        push(@$looppath, $step);
        push(@names, $step->[0]);
    }
    my $loop_path = join('-', @names);
    return if $loop_paths{$loop_path};                # already recorded
    push(@{$loops{$start_unitig}}, $looppath);
    $loop_paths{$loop_path}++;
}

# check: separate the paths that have reached the end step from those still open.
#   arg 1 - reference to an array of paths
#   arg 2 - allowed gap size deviation, handed to CheckGapSize
# A path whose last step name equals the global $end is never returned; it is
# pushed onto the global @solutions and passed to StoreTies when CheckGapSize
# accepts it. All other paths are returned for further searching.
# Returns a reference to the array of open paths (undef when there are none).
sub check{
   my ($newpaths, $gap_size_deviation) = @_;
   my $candidates;
   foreach my $newpath (@$newpaths){
      my $newstep = $newpath->[-1]->[0]; # name of the last step, e.g. 167F
      if ($newstep ne $end){
         push (@$candidates, $newpath);
         next;
      }
      next unless CheckGapSize($newpath, $gap_size_deviation);
      push (@solutions, $newpath);
      StoreTies($newpath);
   }
   return($candidates); #return candidate paths to continue searching
}


# CheckGapSize: check that a solution is not longer than the gap allows.
#   arg 1 - reference to the path
#   arg 2 - allowed deviation; a value numerically equal to 0 is replaced by 20
# The path length comes from EstimateDist (measured between the globals $startx
# and $endx) and is compared with the expected size of the current gap
# ($gap_info{$gap}{'size'}) plus the deviation. Only this upper bound is tested.
# Returns 1 when the path is short enough, 0 otherwise.
sub CheckGapSize{
    my ($path, $expec_std) = @_;
    my $size = EstimateDist($path, $startx, $endx);
    $expec_std = 20 if $expec_std == 0;
    my $upper_limit = $gap_info{$gap}{'size'} + $expec_std;
    return ($size <= $upper_limit) ? 1 : 0;
}

# EstimateDist: estimate the distance covered by a path.
#   $path      - reference to the path; each step is [step name, loci array ref]
#   $startstep - name of the step the distance is measured from
#   $endstep   - name of the step the distance is measured to
# Sizes are read from the global %steps.
# The step-to-step progress (element [0] of the next step's loci minus element [2]
# of the current step's loci) is summed over the path, element [2] of the last
# step's loci and the size of the first step are subtracted, and the result is
# corrected by the size difference between the path's first/last steps and
# $startstep/$endstep.
# Returns that number.
sub EstimateDist{
    my ($path, $startstep, $endstep) = @_;
    my $lastloci   = $path->[-1]->[1];
    my $size_1st   = $steps{ $path->[0]->[0] }{'size'};
    my $size_1stx  = $steps{$startstep}{'size'};
    my $size_last  = $steps{ $path->[-1]->[0] }{'size'};
    my $size_lastx = $steps{$endstep}{'size'};

    my $size;
    foreach my $i (0 .. $#{$path} - 1) {
        $size += ($path->[$i+1]->[1]->[0]) - ($path->[$i]->[1]->[2]);
    }
    $size = $size - $lastloci->[2] - $size_1st + ($size_1st + $size_last - $size_1stx - $size_lastx);
    return ($size);
}


# StoreTies: add the unitigs within a solution to the global hashes %ties and
# %bridging_unitigs for future comparison.
#   $_[0] - reference to the solution path; each step is [step name, loci array ref]
# For every pair of consecutive steps a copy of the second step's loci, without
# its last element, is stored as $ties{first}{'out'}{second} and
# $ties{second}{'in'}{first} (both point at the same array).
# For every step except the last whose name contains neither W nor E, the
# remainder of the path after that step is stored in %bridging_unitigs, unless
# an entry for that step already exists.
sub StoreTies{
    my ($solution) = @_;
    my @path      = @$solution;
    my @remaining = @$solution;      # shrinks from the front as the path is walked
    foreach my $i (0 .. $#path - 1) {
        my $from = $path[$i]->[0];
        my $to   = $path[$i+1]->[0];
        my @loci = @{ $path[$i+1]->[1] };
        pop @loci;                   # drop the last element of the copied loci
        # enrol new ties
        $ties{$from}{'out'}{$to} = \@loci;
        $ties{$to}{'in'}{$from}  = \@loci;
        # record used unitigs
        shift(@remaining);           # @remaining now starts at the step after $from
        next if $from =~ /[WE]/;
        $bridging_unitigs{$from} = [@remaining] unless $bridging_unitigs{$from};
    }
}


#other small subroutines.

# RC_DNA: reverse complement a DNA sequence.
#   $_[0] - the sequence
# Returns the reversed sequence with every base replaced by its complement.
# Handled in both upper and lower case: A, T, U, G, C and the ambiguity codes
# Y, R, S, W, K, M, B, D, H, V, N (U is complemented to A). Any other character
# is left as it is.
sub RC_DNA{
    my ($dna) = @_;
    my $rc = reverse $dna;
    $rc =~ tr/ATUGCYRSWKMBDHVNatugcyrswkmbdhvn/TAACGRYSWMKVHDBNtaacgryswmkvhdbn/;
    return $rc;
}

# round_digit: left-pad a number with zeros so it has as many digits as a total.
#   $n   - the number to pad (e.g. a running index)
#   $tot - the largest number in the series; its digit count sets the width
# Returns $n prefixed with enough "0" characters to reach the width of $tot;
# $n is returned unchanged when it already has at least that many digits.
# The width is taken from the printed integer, not from log10(), because
# int(log(1000)/log(10)) evaluates to 2 in floating point and log(0) is fatal.
sub round_digit{
    my ($n, $tot) = @_;
    my $missing = length(int($tot)) - length(int($n));
    $missing = 0 if $missing < 0;
    return (("0" x $missing).$n);
}

# subroutine taken from an online script, getopts.pl
# Getopts: parse single-letter command line options from the front of @ARGV.
#   argument - option specification string, one character per option; a
#              character followed by ':' takes a value (e.g. "ab:c")
# Options are consumed from @ARGV until the first element that does not start
# with '-' followed by at least one character.
#   - A flag option X sets the package variable $opt_X to 1; flags may be
#     clustered in one argument (-abc).
#   - A value option X takes the rest of the same argument, or the next element
#     of @ARGV when nothing follows the letter. Every value is pushed onto
#     @opt_X; $opt_X holds the values joined by a single space when the option
#     is given more than once.
#   - An unknown option prints "Unknown option: X" to STDERR and counts as an error;
#     a value option with no value left in @ARGV also counts as an error.
# Returns true when no error was counted, false otherwise.
# NOTE(review): the option variables are created with string eval, interpolating
# the option character taken from the command line. Only characters that occur
# in the specification string reach the eval, but that includes ':' itself.
sub Getopts {
    # local() on package variables: the values are dynamically scoped to this call.
    local($argumentative) = @_;
    local(@args,$_,$first,$rest);
    local($errs) = 0;

    # The pattern / */ also matches the empty string, so the specification is split into single characters.
    @args = split( / */, $argumentative );
    while(@ARGV && ($_ = $ARGV[0]) =~ /^-(.)(.*)/) {
		# $first: the option letter; $rest: whatever follows it in the same argument
		($first,$rest) = ($1,$2);
		# $pos is not declared here, so it is a package variable and keeps its value after the call.
		$pos = index($argumentative,$first);
		if($pos >= 0) {
			# A ':' right after the letter in the specification marks an option that takes a value.
			if($args[$pos+1] eq ':') {
				shift(@ARGV);
				if($rest eq '') {
					# The value is the next argument; it is an error if there is none.
					++$errs unless(@ARGV);
					$rest = shift(@ARGV);
				}
				# Evaluated as code: appends the value to @opt_X and to the space-joined $opt_X.
				eval "
				push(\@opt_$first, \$rest);
				if (!defined \$opt_$first or \$opt_$first eq '') {
					\$opt_$first = \$rest;
				}
				else {
					\$opt_$first .= ' ' . \$rest;
				}
				";
			}
			else {
				# Plain flag: set $opt_X, then continue with the remaining clustered letters, if any.
				eval "\$opt_$first = 1";
				if($rest eq '') {
					shift(@ARGV);
				}
				else {
					$ARGV[0] = "-$rest";
				}
			}
		}
		else {
			print STDERR "Unknown option: $first\n";
			++$errs;
			# Skip the unknown letter but keep parsing what follows it.
			if($rest ne '') {
				$ARGV[0] = "-$rest";
			}
			else {
				shift(@ARGV);
			}
		}
	}
    # The value of the last statement is the return value: true when no errors were counted.
    $errs == 0;
}

